]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - src/allmydata/test/test_encodingutil.py
Make platform-detection code tolerate linux-3.0, patch by zooko.
[tahoe-lafs/tahoe-lafs.git] / src / allmydata / test / test_encodingutil.py
1
# Test strings containing non-ASCII characters.  The "_nfc" names spell the
# accented letter as a single precomposed code point (U+00E8, U+00C4); the
# "_nfd" name spells it as a base letter followed by the combining diaeresis
# U+0308.
lumiere_nfc = u"lumi\u00E8re"
Artonwall_nfc = u"\u00C4rtonwall.mp3"
Artonwall_nfd = u"A\u0308rtonwall.mp3"

# Filenames created by the generator script below, and the set that
# EncodingUtil.test_listdir_unicode expects back after normalization.
TEST_FILENAMES = (
  Artonwall_nfc,
  u'test_file',
  u'Blah blah.txt',
)
11
12 # The following main helps to generate a test class for other operating
13 # systems.
14
15 if __name__ == "__main__":
16     import sys, os
17     import tempfile
18     import shutil
19     import platform
20
21     if len(sys.argv) != 2:
22         print "Usage: %s lumi<e-grave>re" % sys.argv[0]
23         sys.exit(1)
24
25     if sys.platform == "win32":
26         try:
27             from allmydata.windows.fixups import initialize
28         except ImportError:
29             print "set PYTHONPATH to the src directory"
30             sys.exit(1)
31         initialize()
32
33     print
34     print "class MyWeirdOS(EncodingUtil, unittest.TestCase):"
35     print "    uname = '%s'" % ' '.join(platform.uname())
36     print "    argv = %s" % repr(sys.argv[1])
37     print "    platform = '%s'" % sys.platform
38     print "    filesystem_encoding = '%s'" % sys.getfilesystemencoding()
39     print "    io_encoding = '%s'" % sys.stdout.encoding
40     try:
41         tmpdir = tempfile.mkdtemp()
42         for fname in TEST_FILENAMES:
43             open(os.path.join(tmpdir, fname), 'w').close()
44
45         # Use Unicode API under Windows or MacOS X
46         if sys.platform in ('win32', 'darwin'):
47             dirlist = os.listdir(unicode(tmpdir))
48         else:
49             dirlist = os.listdir(tmpdir)
50
51         print "    dirlist = %s" % repr(dirlist)
52     except:
53         print "    # Oops, I cannot write filenames containing non-ascii characters"
54     print
55
56     shutil.rmtree(tmpdir)
57     sys.exit(0)
58
59 from twisted.trial import unittest
60 from mock import patch
61 import os, sys, locale
62
63 from allmydata.test.common_util import ReallyEqualMixin
64 from allmydata.util import encodingutil
65 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
66     unicode_to_output, quote_output, unicode_platform, listdir_unicode, \
67     FilenameEncodingError, get_io_encoding, get_filesystem_encoding, _reload
68 from allmydata.dirnode import normalize
69
70 from twisted.python import usage
71
class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
    """Exercise encoding detection and the error paths of encodingutil."""

    def tearDown(self):
        # Several tests below reload encodingutil while sys.stdout or the
        # locale is mocked, or assign encodingutil.io_encoding directly.
        # Reload once more after the mocks are gone so that the module state
        # computed under those fakes does not leak into later tests.
        _reload()

    @patch('sys.stdout')
    def test_get_io_encoding(self, mock_stdout):
        # 'UTF-8' and 'cp65001' are both expected to be reported as 'utf-8'.
        mock_stdout.encoding = 'UTF-8'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')

        mock_stdout.encoding = 'cp65001'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')

        # On win32 the result is expected to be 'utf-8' whatever stdout says.
        mock_stdout.encoding = 'koi8-r'
        if sys.platform == "win32":
            expected = 'utf-8'
        else:
            expected = 'koi8-r'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), expected)

        # An unknown encoding name makes _reload() fail, except on win32.
        mock_stdout.encoding = 'nonexistent_encoding'
        if sys.platform == "win32":
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
        else:
            self.failUnlessRaises(AssertionError, _reload)

    @patch('locale.getpreferredencoding')
    def test_get_io_encoding_not_from_stdout(self, mock_locale_getpreferredencoding):
        locale  # hush pyflakes
        mock_locale_getpreferredencoding.return_value = 'koi8-r'

        # A stdout replacement that initially has no 'encoding' attribute.
        class DummyStdout:
            pass
        old_stdout = sys.stdout
        sys.stdout = DummyStdout()
        try:
            if sys.platform == "win32":
                expected = 'utf-8'
            else:
                expected = 'koi8-r'
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), expected)

            # An 'encoding' attribute of None gives the same result as a
            # missing attribute.
            sys.stdout.encoding = None
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), expected)

            # With no preferred locale encoding either, expect 'utf-8'.
            mock_locale_getpreferredencoding.return_value = None
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
        finally:
            sys.stdout = old_stdout

    def test_argv_to_unicode(self):
        # A latin1-encoded argument is rejected when the I/O encoding is UTF-8.
        encodingutil.io_encoding = 'utf-8'
        self.failUnlessRaises(usage.UsageError,
                              argv_to_unicode,
                              lumiere_nfc.encode('latin1'))

    def test_unicode_to_output(self):
        # lumiere_nfc contains U+00E8, which koi8-r cannot encode.
        encodingutil.io_encoding = 'koi8-r'
        self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)

    @patch('os.listdir')
    def test_no_unicode_normalization(self, mock):
        # Pretend to run on a Unicode platform.
        # We normalized to NFC in 1.7beta, but we now don't.
        orig_platform = sys.platform
        try:
            sys.platform = 'darwin'
            mock.return_value = [Artonwall_nfd]
            _reload()
            self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd])
        finally:
            sys.platform = orig_platform
142
143 # The following tests apply only to platforms that don't store filenames as
144 # Unicode entities on the filesystem.
class EncodingUtilNonUnicodePlatform(unittest.TestCase):
    """Check listdir_unicode() failures with sys.platform forced to 'linux'."""

    def setUp(self):
        # unicode_platform() uses sys.platform, so override it for the
        # duration of each test and remember the real value.
        self.original_platform = sys.platform
        sys.platform = 'linux'

    def tearDown(self):
        # Put the real platform back, then recompute encodingutil's state.
        sys.platform = self.original_platform
        _reload()

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        # Case 1: latin1-encoded filenames encountered on a UTF-8
        # filesystem.  The listing mixes a valid UTF-8 name with a latin1 one.
        utf8_name = lumiere_nfc.encode('utf-8')
        latin1_name = lumiere_nfc.encode('latin1')
        mock_listdir.return_value = [utf8_name, latin1_name]
        mock_getfilesystemencoding.return_value = 'utf-8'
        _reload()
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/dummy')

        # Case 2: the directory name itself cannot be represented in the
        # filesystem encoding.  This should fail.
        unrepresentable_dir = u'/' + lumiere_nfc
        mock_getfilesystemencoding.return_value = 'ascii'
        _reload()
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, unrepresentable_dir)
177
178
class EncodingUtil(ReallyEqualMixin):
    """Mixin of encodingutil tests driven by per-platform class attributes.

    Concrete subclasses must define `platform`.  The argv/output tests also
    read `io_encoding` and `argv`; the listdir test also reads
    `filesystem_encoding` and `dirlist`.  A test whose `argv` or `dirlist`
    attribute is missing returns early without checking anything.
    """

    def setUp(self):
        # Impersonate the platform described by the subclass.
        self.original_platform = sys.platform
        sys.platform = self.platform

    def tearDown(self):
        sys.platform = self.original_platform
        _reload()

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.io_encoding
        argu = lumiere_nfc
        argv = self.argv
        _reload()
        self.failUnlessReallyEqual(argv_to_unicode(argv), argu)

    def test_unicode_to_url(self):
        # Compare against the UTF-8 bytes.  failUnless(value, expected) would
        # treat the expected string as the failure message and only check
        # that the value is truthy.
        self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")

    @patch('sys.stdout')
    def test_unicode_to_output(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.io_encoding
        _reload()
        self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)

    def test_unicode_platform(self):
        # Expected unicode_platform() result for each sys.platform value
        # used by the subclasses.
        matrix = {
          'linux2': False,
          'linux3': False,
          'openbsd4': False,
          'win32':  True,
          'darwin': True,
        }

        _reload()
        self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        if not hasattr(self, 'dirlist'):
            return

        # The codec named by the fixture may not exist on the machine
        # running the tests; skip rather than fail in that case.
        try:
            u"test".encode(self.filesystem_encoding)
        except (LookupError, AttributeError):
            raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
                                    "that we are testing for the benefit of a different platform."
                                    % (self.filesystem_encoding,))

        mock_listdir.return_value = self.dirlist
        mock_getfilesystemencoding.return_value = self.filesystem_encoding

        _reload()
        filenames = listdir_unicode(u'/dummy')

        # Compare as sets of normalized names: the order of the listing and
        # the normalization form of its entries differ between fixtures.
        self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
                             set(TEST_FILENAMES))
244
245
246 class StdlibUnicode(unittest.TestCase):
247     """This mainly tests that some of the stdlib functions support Unicode paths, but also that
248     listdir_unicode works for valid filenames."""
249
250     def skip_if_cannot_represent_filename(self, u):
251         enc = get_filesystem_encoding()
252         if not unicode_platform():
253             try:
254                 u.encode(enc)
255             except UnicodeEncodeError:
256                 raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
257
258     def test_mkdir_open_exists_abspath_listdir_expanduser(self):
259         self.skip_if_cannot_represent_filename(lumiere_nfc)
260
261         try:
262             os.mkdir(lumiere_nfc)
263         except EnvironmentError, e:
264             raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
265                                     "does not support Unicode, even though the platform does." % (e,))
266
267         fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
268         open(fn, 'wb').close()
269         self.failUnless(os.path.exists(fn))
270         self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn)))
271         filenames = listdir_unicode(lumiere_nfc)
272
273         # We only require that the listing includes a filename that is canonically equivalent
274         # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
275         self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
276
277         expanded = os.path.expanduser("~/" + lumiere_nfc)
278         self.failIfIn("~", expanded)
279         self.failUnless(expanded.endswith(lumiere_nfc), expanded)
280
281     def test_open_unrepresentable(self):
282         if unicode_platform():
283             raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")
284
285         enc = get_filesystem_encoding()
286         fn = u'\u2621.txt'
287         try:
288             fn.encode(enc)
289             raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
290         except UnicodeEncodeError:
291             self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
292
293
class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
    """Table-driven checks of quote_output() for several output encodings."""

    def tearDown(self):
        # test_quote_output_default assigns encodingutil.io_encoding.
        _reload()

    def _check(self, inp, out, enc, optional_quotes):
        """Check quote_output(inp) against `out`, with and without quotemarks.

        When `optional_quotes` is true, the quotemarks=False result is
        expected to be `out` without its first and last character;
        otherwise it is expected to equal `out`.  Unless `out` starts with
        b", the same checks are repeated with `inp` converted to the other
        string type.
        """
        if optional_quotes:
            unquoted = out[1:-1]
        else:
            unquoted = out

        def verify(value):
            self.failUnlessReallyEqual(quote_output(value, encoding=enc), out)
            self.failUnlessReallyEqual(quote_output(value, encoding=enc, quotemarks=False), unquoted)

        verify(inp)
        if out[0:2] == 'b"':
            return
        if isinstance(inp, str):
            verify(unicode(inp))
        else:
            verify(inp.encode('utf-8'))

    def _test_quote_output_all(self, enc):
        """Run the cases whose expected output is the same for every encoding."""
        cases = [
            # optional single quotes
            ("foo",  "'foo'",  True),
            ("\\",   "'\\'",   True),
            ("$\"`", "'$\"`'", True),

            # mandatory single quotes
            ("\"",   "'\"'", False),

            # double quotes
            ("'",    "\"'\"", False),
            ("\n",   "\"\\x0a\"", False),
            ("\x00", "\"\\x00\"", False),

            # invalid Unicode and astral planes
            (u"\uFDD0\uFDEF",       "\"\\ufdd0\\ufdef\"", False),
            (u"\uDC00\uD800",       "\"\\udc00\\ud800\"", False),
            (u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"", False),
            (u"\uD800\uDC00",       "\"\\U00010000\"", False),
            (u"\uD800\uDC01",       "\"\\U00010001\"", False),
            (u"\uD801\uDC00",       "\"\\U00010400\"", False),
            (u"\uDBFF\uDFFF",       "\"\\U0010ffff\"", False),
            (u"'\uDBFF\uDFFF",      "\"'\\U0010ffff\"", False),
            (u"\"\uDBFF\uDFFF",     "\"\\\"\\U0010ffff\"", False),

            # invalid UTF-8
            ("\xFF",                "b\"\\xff\"", False),
            ("\x00\"$\\`\x80\xFF",  "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"", False),
        ]
        for inp, out, optional_quotes in cases:
            self._check(inp, out, enc, optional_quotes)

    def test_quote_output_ascii(self, enc='ascii'):
        self._test_quote_output_all(enc)

        # Non-ASCII characters are expected as backslash escapes.
        cases = [
            (u"\u00D7",   "\"\\xd7\"", False),
            (u"'\u00D7",  "\"'\\xd7\"", False),
            (u"\"\u00D7", "\"\\\"\\xd7\"", False),
            (u"\u2621",   "\"\\u2621\"", False),
            (u"'\u2621",  "\"'\\u2621\"", False),
            (u"\"\u2621", "\"\\\"\\u2621\"", False),
        ]
        for inp, out, optional_quotes in cases:
            self._check(inp, out, enc, optional_quotes)

    def test_quote_output_latin1(self, enc='latin1'):
        self._test_quote_output_all(enc)

        # Expected values are written as unicode and encoded to latin1
        # before being compared.
        cases = [
            (u"\u00D7",   u"'\u00D7'", True),
            (u"'\u00D7",  u"\"'\u00D7\"", False),
            (u"\"\u00D7", u"'\"\u00D7'", False),
            (u"\u00D7\"", u"'\u00D7\"'", True),
            (u"\u2621",   u"\"\\u2621\"", False),
            (u"'\u2621",  u"\"'\\u2621\"", False),
            (u"\"\u2621", u"\"\\\"\\u2621\"", False),
        ]
        for inp, out, optional_quotes in cases:
            self._check(inp, out.encode('latin1'), enc, optional_quotes)

    def test_quote_output_utf8(self, enc='utf-8'):
        self._test_quote_output_all(enc)

        # Expected values are written as unicode and encoded to UTF-8
        # before being compared.
        cases = [
            (u"\u2621",   u"'\u2621'", True),
            (u"'\u2621",  u"\"'\u2621\"", False),
            (u"\"\u2621", u"'\"\u2621'", False),
            (u"\u2621\"", u"'\u2621\"'", True),
        ]
        for inp, out, optional_quotes in cases:
            self._check(inp, out.encode('utf-8'), enc, optional_quotes)

    def test_quote_output_default(self):
        # Passing enc=None re-runs each suite with the encoding taken from
        # encodingutil.io_encoding instead of an explicit argument.
        encodingutil.io_encoding = 'ascii'
        self.test_quote_output_ascii(None)

        encodingutil.io_encoding = 'latin1'
        self.test_quote_output_latin1(None)

        encodingutil.io_encoding = 'utf-8'
        self.test_quote_output_utf8(None)
389
390
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
    # Fixture for the EncodingUtil tests: a 'linux2' platform whose
    # filesystem and I/O encodings are both UTF-8.  argv and the dirlist
    # entries are byte strings holding UTF-8 encoded names.
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    argv = 'lumi\xc3\xa8re'
    platform = 'linux2'
    filesystem_encoding = 'UTF-8'
    io_encoding = 'UTF-8'
    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
398
class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
    # Fixture for the EncodingUtil tests: a 'linux2' platform whose
    # filesystem and I/O encodings are both ISO-8859-1.  argv and the
    # dirlist entries are byte strings holding latin1 encoded names.
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    argv = 'lumi\xe8re'
    platform = 'linux2'
    filesystem_encoding = 'ISO-8859-1'
    io_encoding = 'ISO-8859-1'
    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
406
class Windows(EncodingUtil, unittest.TestCase):
    # Fixture for the EncodingUtil tests: a 'win32' platform.  argv is a
    # UTF-8 encoded byte string, while the dirlist entries are unicode
    # strings using the precomposed U+00C4.
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    argv = 'lumi\xc3\xa8re'
    platform = 'win32'
    filesystem_encoding = 'mbcs'
    io_encoding = 'utf-8'
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
414
class MacOSXLeopard(EncodingUtil, unittest.TestCase):
    # Fixture for the EncodingUtil tests: a 'darwin' platform with UTF-8
    # I/O.  The dirlist entries are unicode strings in decomposed form
    # (A followed by the combining diaeresis U+0308).
    # NOTE(review): this class defines `output` rather than `argv`, so the
    # mixin tests that look for an `argv` attribute return early here --
    # confirm whether `argv` was intended.
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    output = 'lumi\xc3\xa8re'
    platform = 'darwin'
    filesystem_encoding = 'utf-8'
    io_encoding = 'UTF-8'
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
422
class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
    # Fixture for the EncodingUtil tests: a 'darwin' platform with a UTF-8
    # filesystem encoding but a US-ASCII I/O encoding.  No `argv` is
    # defined, so the mixin tests that need one return early.
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    platform = 'darwin'
    filesystem_encoding = 'utf-8'
    io_encoding = 'US-ASCII'
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
429
class OpenBSD(EncodingUtil, unittest.TestCase):
    # Fixture for the EncodingUtil tests: an 'openbsd4' platform whose
    # filesystem and I/O encodings are both '646'.  Neither `argv` nor
    # `dirlist` is defined, so the mixin tests that need them return early.
    uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
    platform = 'openbsd4'
    filesystem_encoding = '646'
    io_encoding = '646'
    # Oops, I cannot write filenames containing non-ascii characters