]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - src/allmydata/test/test_encodingutil.py
5613a1d6af721ec018e87d4f272f019ec02d053f
[tahoe-lafs/tahoe-lafs.git] / src / allmydata / test / test_encodingutil.py
1
# Sample names shared by the tests below.  The _nfc/_nfd suffix records how
# the accented letter is spelled: a single precomposed code point, or a base
# letter followed by a combining mark.
lumiere_nfc = u"lumi\xe8re"
Artonwall_nfc = u"\xc4rtonwall.mp3"
Artonwall_nfd = u"A\u0308rtonwall.mp3"

# File names created by the generator script below and expected back from
# listdir_unicode() (after normalization) in the per-platform test classes.
TEST_FILENAMES = (Artonwall_nfc, u'test_file', u'Blah blah.txt')
11
12 # The following main helps to generate a test class for other operating
13 # systems.
14
if __name__ == "__main__":
    # Generator script: run this file directly with the word
    # "lumi<e-grave>re" as its only argument and it prints the source of an
    # EncodingUtil subclass describing the current platform.
    import sys, os
    import tempfile
    import shutil
    import platform

    if len(sys.argv) != 2:
        print("Usage: %s lumi<e-grave>re" % sys.argv[0])
        sys.exit(1)

    if sys.platform == "win32":
        try:
            from allmydata.windows.fixups import initialize
        except ImportError:
            print("set PYTHONPATH to the src directory")
            sys.exit(1)
        initialize()

    print("")
    print("class MyWeirdOS(EncodingUtil, unittest.TestCase):")
    print("    uname = '%s'" % ' '.join(platform.uname()))
    print("    argv = %s" % repr(sys.argv[1]))
    print("    platform = '%s'" % sys.platform)
    print("    filesystem_encoding = '%s'" % sys.getfilesystemencoding())
    print("    io_encoding = '%s'" % sys.stdout.encoding)

    # Create the scratch directory before entering the try block, so that the
    # cleanup in `finally` never refers to an unbound name.
    tmpdir = tempfile.mkdtemp()
    try:
        for fname in TEST_FILENAMES:
            open(os.path.join(tmpdir, fname), 'w').close()

        # Use Unicode API under Windows or MacOS X
        if sys.platform in ('win32', 'darwin'):
            dirlist = os.listdir(unicode(tmpdir))
        else:
            dirlist = os.listdir(tmpdir)

        print("    dirlist = %s" % repr(dirlist))
    except (EnvironmentError, UnicodeError):
        # Only filesystem and encoding failures mean "this platform cannot
        # store the names"; anything else (e.g. Ctrl-C) should propagate.
        print("    # Oops, I cannot write filenames containing non-ascii characters")
    finally:
        shutil.rmtree(tmpdir)
    print("")

    sys.exit(0)
58
59 from twisted.trial import unittest
60 from mock import patch
61 import os, sys, locale
62
63 from allmydata.test.common_util import ReallyEqualMixin
64 from allmydata.util import encodingutil, fileutil
65 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
66     unicode_to_output, quote_output, quote_path, quote_local_unicode_path, \
67     unicode_platform, listdir_unicode, FilenameEncodingError, get_io_encoding, \
68     get_filesystem_encoding, to_str, from_utf8_or_none, _reload
69 from allmydata.dirnode import normalize
70
71 from twisted.python import usage
72
class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
    """Encoding detection and error paths of allmydata.util.encodingutil."""

    def tearDown(self):
        # Several tests below overwrite encodingutil.io_encoding or call
        # _reload() while sys.stdout / sys.platform are faked.  Reload once
        # more against the real environment so that state cannot leak into
        # later tests (the other test classes in this file do the same).
        _reload()

    @patch('sys.stdout')
    def test_get_io_encoding(self, mock_stdout):
        # Each case fakes sys.stdout.encoding, reloads encodingutil and checks
        # the name reported by get_io_encoding().
        mock_stdout.encoding = 'UTF-8'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')

        mock_stdout.encoding = 'cp65001'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')

        # On win32 the expected answer is 'utf-8' whatever stdout claims.
        mock_stdout.encoding = 'koi8-r'
        expected = 'utf-8' if sys.platform == "win32" else 'koi8-r'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), expected)

        mock_stdout.encoding = 'nonexistent_encoding'
        if sys.platform == "win32":
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
        else:
            self.failUnlessRaises(AssertionError, _reload)

    @patch('locale.getpreferredencoding')
    def test_get_io_encoding_not_from_stdout(self, mock_locale_getpreferredencoding):
        locale  # hush pyflakes
        mock_locale_getpreferredencoding.return_value = 'koi8-r'

        # A stdout object with no `encoding` attribute at all.
        class DummyStdout:
            pass
        old_stdout = sys.stdout
        sys.stdout = DummyStdout()
        try:
            expected = 'utf-8' if sys.platform == "win32" else 'koi8-r'
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), expected)

            # An `encoding` attribute that is present but None.
            sys.stdout.encoding = None
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), expected)

            # Neither stdout nor the locale names an encoding.
            mock_locale_getpreferredencoding.return_value = None
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
        finally:
            sys.stdout = old_stdout

    def test_argv_to_unicode(self):
        # A latin1-encoded argument must be rejected when the I/O encoding
        # is utf-8.
        encodingutil.io_encoding = 'utf-8'
        self.failUnlessRaises(usage.UsageError,
                              argv_to_unicode,
                              lumiere_nfc.encode('latin1'))

    def test_unicode_to_output(self):
        # koi8-r cannot represent the e-grave in lumiere_nfc.
        encodingutil.io_encoding = 'koi8-r'
        self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)

    @patch('os.listdir')
    def test_no_unicode_normalization(self, mock):
        # Pretend to run on a Unicode platform.
        # We normalized to NFC in 1.7beta, but we now don't.
        orig_platform = sys.platform
        try:
            sys.platform = 'darwin'
            mock.return_value = [Artonwall_nfd]
            _reload()
            self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd])
        finally:
            sys.platform = orig_platform
143
144 # The following tests apply only to platforms that don't store filenames as
145 # Unicode entities on the filesystem.
class EncodingUtilNonUnicodePlatform(unittest.TestCase):
    def setUp(self):
        # unicode_platform() looks at sys.platform, so fake a Linux host and
        # remember the real value for tearDown.
        self.original_platform = sys.platform
        sys.platform = 'linux'

    def tearDown(self):
        sys.platform = self.original_platform
        _reload()

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        def assert_listing_fails(fs_encoding, path):
            # Reload encodingutil under the given filesystem encoding and
            # require listdir_unicode(path) to raise FilenameEncodingError.
            mock_getfilesystemencoding.return_value = fs_encoding
            _reload()
            self.failUnlessRaises(FilenameEncodingError, listdir_unicode, path)

        # One utf-8 name and one latin1 name in the same listing.
        mock_listdir.return_value = [lumiere_nfc.encode(codec)
                                     for codec in ('utf-8', 'latin1')]

        # What happens if latin1-encoded filenames are encountered on an UTF-8
        # filesystem?
        assert_listing_fails('utf-8', u'/dummy')

        # We're trying to list a directory whose name cannot be represented in
        # the filesystem encoding.  This should fail.
        assert_listing_fails('ascii', u'/' + lumiere_nfc)
178
179
class EncodingUtil(ReallyEqualMixin):
    """Platform-parameterised checks, mixed into one TestCase per platform.

    Concrete subclasses provide `platform`, `filesystem_encoding` and
    `io_encoding`, and optionally `argv` and `dirlist`; tests whose optional
    attribute is missing return early without checking anything.
    """

    def setUp(self):
        # Pretend to be running on the platform the subclass describes.
        self.original_platform = sys.platform
        sys.platform = self.platform

    def tearDown(self):
        sys.platform = self.original_platform
        _reload()

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.io_encoding
        argu = lumiere_nfc
        argv = self.argv
        _reload()
        self.failUnlessReallyEqual(argv_to_unicode(argv), argu)

    def test_unicode_to_url(self):
        # Compare the values.  failUnless(value, expected) would treat the
        # expected string as the failure message and pass for any non-empty
        # result.
        self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")

    @patch('sys.stdout')
    def test_unicode_to_output(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.io_encoding
        _reload()
        self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)

    def test_unicode_platform(self):
        # Expected unicode_platform() result for each sys.platform value used
        # by the subclasses in this file.
        matrix = {
          'linux2': False,
          'linux3': False,
          'openbsd4': False,
          'win32':  True,
          'darwin': True,
        }

        _reload()
        self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        if not hasattr(self, 'dirlist'):
            return

        # The codec being simulated may not exist in the Python running the
        # test; skip rather than fail in that case.
        try:
            u"test".encode(self.filesystem_encoding)
        except (LookupError, AttributeError):
            raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
                                    "that we are testing for the benefit of a different platform."
                                    % (self.filesystem_encoding,))

        mock_listdir.return_value = self.dirlist
        mock_getfilesystemencoding.return_value = self.filesystem_encoding

        _reload()
        filenames = listdir_unicode(u'/dummy')

        # Compare as sets of normalized names: order and normalization form
        # of the recorded listings differ between platforms.
        self.failUnlessEqual(set(normalize(fname) for fname in filenames),
                             set(TEST_FILENAMES))
245
246
class StdlibUnicode(unittest.TestCase):
    """Checks that several stdlib functions accept Unicode paths, and that
    listdir_unicode copes with valid filenames."""

    def skip_if_cannot_represent_filename(self, u):
        # Skip the calling test when the platform stores filenames as bytes
        # and the filesystem encoding cannot encode `u`.
        enc = get_filesystem_encoding()
        if unicode_platform():
            return
        try:
            u.encode(enc)
        except UnicodeEncodeError:
            raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")

    def test_mkdir_open_exists_abspath_listdir_expanduser(self):
        self.skip_if_cannot_represent_filename(lumiere_nfc)

        try:
            os.mkdir(lumiere_nfc)
        except EnvironmentError as e:
            raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
                                    "does not support Unicode, even though the platform does." % (e,))

        # Create <lumiere>/<lumiere>.txt and check it is visible both by its
        # relative name and by an absolute one.
        fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
        open(fn, 'wb').close()
        self.failUnless(os.path.exists(fn))
        absolute = os.path.join(os.getcwdu(), fn)
        self.failUnless(os.path.exists(absolute))
        listing = listdir_unicode(lumiere_nfc)

        # We only require that the listing includes a filename that is canonically equivalent
        # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
        normalized = set(normalize(name) for name in listing)
        self.failUnlessIn(lumiere_nfc + ".txt", normalized)

        expanded = fileutil.expanduser(u"~/" + lumiere_nfc)
        self.failIfIn(u"~", expanded)
        self.failUnless(expanded.endswith(lumiere_nfc), expanded)

    def test_open_unrepresentable(self):
        if unicode_platform():
            raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")

        enc = get_filesystem_encoding()
        fn = u'\u2621.txt'
        try:
            fn.encode(enc)
        except UnicodeEncodeError:
            # The name really is unrepresentable, so open() must refuse it.
            self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
        else:
            raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
293
294
class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
    def tearDown(self):
        _reload()

    def _check(self, inp, out, enc, optional_quotes, quote_newlines):
        # `out` is the expected result with quote marks; when the quotes are
        # optional, the quotemarks=False result is `out` without its first
        # and last character.
        if optional_quotes:
            bare = out[1:-1]
        else:
            bare = out

        def verify(value):
            self.failUnlessReallyEqual(quote_output(value, encoding=enc, quote_newlines=quote_newlines), out)
            self.failUnlessReallyEqual(quote_output(value, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), bare)

        verify(inp)
        # An expected value starting with b" is only checked against the
        # input exactly as given.
        if out[0:2] == 'b"':
            return
        # Otherwise the same expectation must hold for the input's other
        # string type.
        if isinstance(inp, str):
            verify(unicode(inp))
        else:
            verify(inp.encode('utf-8'))

    def _checker(self, enc, out_codec=None):
        # Build the check(inp, out, ...) helper used by the tests below; when
        # out_codec is given, the expected output is encoded with it first.
        def check(inp, out, optional_quotes=False, quote_newlines=None):
            if out_codec is not None:
                out = out.encode(out_codec)
            self._check(inp, out, enc, optional_quotes, quote_newlines)
        return check

    def _test_quote_output_all(self, enc):
        check = self._checker(enc)

        # optional single quotes
        check("foo",  "'foo'",  True)
        check("\\",   "'\\'",   True)
        check("$\"`", "'$\"`'", True)
        check("\n",   "'\n'",   True, quote_newlines=False)

        # mandatory single quotes
        check("\"",   "'\"'")

        # double quotes
        check("'",    "\"'\"")
        check("\n",   "\"\\x0a\"", quote_newlines=True)
        check("\x00", "\"\\x00\"")

        # invalid Unicode and astral planes
        check(u"\uFDD0\uFDEF",       "\"\\ufdd0\\ufdef\"")
        check(u"\uDC00\uD800",       "\"\\udc00\\ud800\"")
        check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
        check(u"\uD800\uDC00",       "\"\\U00010000\"")
        check(u"\uD800\uDC01",       "\"\\U00010001\"")
        check(u"\uD801\uDC00",       "\"\\U00010400\"")
        check(u"\uDBFF\uDFFF",       "\"\\U0010ffff\"")
        check(u"'\uDBFF\uDFFF",      "\"'\\U0010ffff\"")
        check(u"\"\uDBFF\uDFFF",     "\"\\\"\\U0010ffff\"")

        # invalid UTF-8
        check("\xFF",                "b\"\\xff\"")
        check("\x00\"$\\`\x80\xFF",  "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")

    def test_quote_output_ascii(self, enc='ascii'):
        check = self._checker(enc)

        self._test_quote_output_all(enc)
        check(u"\u00D7",   "\"\\xd7\"")
        check(u"'\u00D7",  "\"'\\xd7\"")
        check(u"\"\u00D7", "\"\\\"\\xd7\"")
        check(u"\u2621",   "\"\\u2621\"")
        check(u"'\u2621",  "\"'\\u2621\"")
        check(u"\"\u2621", "\"\\\"\\u2621\"")
        check(u"\n",       "'\n'",      True, quote_newlines=False)
        check(u"\n",       "\"\\x0a\"", quote_newlines=True)

    def test_quote_output_latin1(self, enc='latin1'):
        check = self._checker(enc, 'latin1')

        self._test_quote_output_all(enc)
        check(u"\u00D7",   u"'\u00D7'", True)
        check(u"'\u00D7",  u"\"'\u00D7\"")
        check(u"\"\u00D7", u"'\"\u00D7'")
        check(u"\u00D7\"", u"'\u00D7\"'", True)
        check(u"\u2621",   u"\"\\u2621\"")
        check(u"'\u2621",  u"\"'\\u2621\"")
        check(u"\"\u2621", u"\"\\\"\\u2621\"")
        check(u"\n",       u"'\n'", True, quote_newlines=False)
        check(u"\n",       u"\"\\x0a\"", quote_newlines=True)

    def test_quote_output_utf8(self, enc='utf-8'):
        check = self._checker(enc, 'utf-8')

        self._test_quote_output_all(enc)
        check(u"\u2621",   u"'\u2621'", True)
        check(u"'\u2621",  u"\"'\u2621\"")
        check(u"\"\u2621", u"'\"\u2621'")
        check(u"\u2621\"", u"'\u2621\"'", True)
        check(u"\n",       u"'\n'", True, quote_newlines=False)
        check(u"\n",       u"\"\\x0a\"", quote_newlines=True)

    def test_quote_output_default(self):
        # Re-run each per-encoding suite with encoding=None, after setting
        # the module-level io_encoding to the matching codec.
        suites = (
            ('ascii', self.test_quote_output_ascii),
            ('latin1', self.test_quote_output_latin1),
            ('utf-8', self.test_quote_output_utf8),
        )
        for io_enc, run_suite in suites:
            encodingutil.io_encoding = io_enc
            run_suite(None)
397
398
class QuotePaths(ReallyEqualMixin, unittest.TestCase):
    def test_quote_path(self):
        # (path segments, keyword arguments, expected result)
        segment_cases = (
            ([u'foo', u'bar'],   {},                    "'foo/bar'"),
            ([u'foo', u'bar'],   {'quotemarks': True},  "'foo/bar'"),
            ([u'foo', u'bar'],   {'quotemarks': False}, "foo/bar"),
            ([u'foo', u'\nbar'], {},                    '"foo/\\x0abar"'),
            ([u'foo', u'\nbar'], {'quotemarks': True},  '"foo/\\x0abar"'),
            ([u'foo', u'\nbar'], {'quotemarks': False}, '"foo/\\x0abar"'),
        )
        for segments, kwargs, expected in segment_cases:
            self.failUnlessReallyEqual(quote_path(segments, **kwargs), expected)

        # (local path, keyword arguments, expected on win32, expected elsewhere)
        local_cases = (
            (u"\\\\?\\C:\\foo", {},
             "'C:\\foo'", "'\\\\?\\C:\\foo'"),
            (u"\\\\?\\C:\\foo", {'quotemarks': True},
             "'C:\\foo'", "'\\\\?\\C:\\foo'"),
            (u"\\\\?\\C:\\foo", {'quotemarks': False},
             "C:\\foo", "\\\\?\\C:\\foo"),
            (u"\\\\?\\UNC\\foo\\bar", {},
             "'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"),
            (u"\\\\?\\UNC\\foo\\bar", {'quotemarks': True},
             "'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"),
            (u"\\\\?\\UNC\\foo\\bar", {'quotemarks': False},
             "\\\\foo\\bar", "\\\\?\\UNC\\foo\\bar"),
        )
        for local_path, kwargs, on_win32, elsewhere in local_cases:
            if sys.platform == "win32":
                expected = on_win32
            else:
                expected = elsewhere
            self.failUnlessReallyEqual(quote_local_unicode_path(local_path, **kwargs), expected)
423
424
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
    # Recorded from the Linux host named in `uname`, with UTF-8 as both the
    # I/O and the filesystem encoding; argv and dirlist are byte strings.
    platform = 'linux2'
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    io_encoding = 'UTF-8'
    filesystem_encoding = 'UTF-8'
    argv = 'lumi\xc3\xa8re'
    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
432
class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
    # Same Linux host as recorded in `uname`, but with ISO-8859-1 as both the
    # I/O and the filesystem encoding; argv and dirlist are byte strings.
    platform = 'linux2'
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    io_encoding = 'ISO-8859-1'
    filesystem_encoding = 'ISO-8859-1'
    argv = 'lumi\xe8re'
    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
440
class Windows(EncodingUtil, unittest.TestCase):
    # Recorded on the Windows host named in `uname`; the directory listing
    # holds unicode objects rather than byte strings.
    platform = 'win32'
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    io_encoding = 'utf-8'
    filesystem_encoding = 'mbcs'
    argv = 'lumi\xc3\xa8re'
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
448
class MacOSXLeopard(EncodingUtil, unittest.TestCase):
    # Recorded on the Darwin host named in `uname`; the directory listing
    # holds unicode objects, with the A-umlaut in decomposed form.
    platform = 'darwin'
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    io_encoding = 'UTF-8'
    filesystem_encoding = 'utf-8'
    # NOTE(review): named `output`, not `argv`, so the argv-based checks in
    # EncodingUtil return early for this class -- confirm that is intended.
    output = 'lumi\xc3\xa8re'
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
456
class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
    # The Darwin host named in `uname` with a US-ASCII I/O encoding.  No
    # `argv` is defined, so the argv-based checks return early.
    platform = 'darwin'
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    io_encoding = 'US-ASCII'
    filesystem_encoding = 'utf-8'
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
463
class OpenBSD(EncodingUtil, unittest.TestCase):
    # Recorded on the OpenBSD host named in `uname`.  Neither `argv` nor
    # `dirlist` is defined, so the checks that need them return early.
    platform = 'openbsd4'
    uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
    io_encoding = '646'
    filesystem_encoding = '646'
    # Oops, I cannot write filenames containing non-ascii characters
470
471
class TestToFromStr(ReallyEqualMixin, unittest.TestCase):
    def test_to_str(self):
        # (argument, expected result) pairs for to_str().
        expectations = (
            ("foo", "foo"),
            ("lumi\xc3\xa8re", "lumi\xc3\xa8re"),
            ("\xFF", "\xFF"),  # passes through invalid UTF-8 -- is this what we want?
            (u"lumi\u00E8re", "lumi\xc3\xa8re"),
            (None, None),
        )
        for value, wanted in expectations:
            self.failUnlessReallyEqual(to_str(value), wanted)

    def test_from_utf8_or_none(self):
        # A unicode argument trips an assertion.
        self.failUnlessRaises(AssertionError, from_utf8_or_none, u"foo")

        # Valid UTF-8 is decoded and None is passed through.
        for value, wanted in (("lumi\xc3\xa8re", u"lumi\u00E8re"), (None, None)):
            self.failUnlessReallyEqual(from_utf8_or_none(value), wanted)

        # Invalid UTF-8 raises instead of being passed through.
        self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, "\xFF")