]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - src/allmydata/test/test_encodingutil.py
96db341026071e559dcdb65a265e004687624f86
[tahoe-lafs/tahoe-lafs.git] / src / allmydata / test / test_encodingutil.py
1
# "lumi<e-grave>re" with the accented letter as one precomposed code point
# (U+00E8), i.e. in NFC form.
lumiere_nfc = u"lumi\u00E8re"

# One file name spelled two ways: precomposed A-with-diaeresis (U+00C4), and
# a plain "A" followed by a combining diaeresis (U+0308).
Artonwall_nfc = u"\u00C4rtonwall.mp3"
Artonwall_nfd = u"A\u0308rtonwall.mp3"

# Names created by the generator script below and expected back (after
# normalization) from listdir_unicode() in the tests.
TEST_FILENAMES = (Artonwall_nfc, u'test_file', u'Blah blah.txt')
11
12 # The following main helps to generate a test class for other operating
13 # systems.
14
15 if __name__ == "__main__":
16     import sys, os
17     import tempfile
18     import shutil
19     import platform
20
21     if len(sys.argv) != 2:
22         print "Usage: %s lumi<e-grave>re" % sys.argv[0]
23         sys.exit(1)
24
25     if sys.platform == "win32":
26         try:
27             from allmydata.windows.fixups import initialize
28         except ImportError:
29             print "set PYTHONPATH to the src directory"
30             sys.exit(1)
31         initialize()
32
33     print
34     print "class MyWeirdOS(EncodingUtil, unittest.TestCase):"
35     print "    uname = '%s'" % ' '.join(platform.uname())
36     print "    argv = %s" % repr(sys.argv[1])
37     print "    platform = '%s'" % sys.platform
38     print "    filesystem_encoding = '%s'" % sys.getfilesystemencoding()
39     print "    io_encoding = '%s'" % sys.stdout.encoding
40     try:
41         tmpdir = tempfile.mkdtemp()
42         for fname in TEST_FILENAMES:
43             open(os.path.join(tmpdir, fname), 'w').close()
44
45         # Use Unicode API under Windows or MacOS X
46         if sys.platform in ('win32', 'darwin'):
47             dirlist = os.listdir(unicode(tmpdir))
48         else:
49             dirlist = os.listdir(tmpdir)
50
51         print "    dirlist = %s" % repr(dirlist)
52     except:
53         print "    # Oops, I cannot write filenames containing non-ascii characters"
54     print
55
56     shutil.rmtree(tmpdir)
57     sys.exit(0)
58
59 from twisted.trial import unittest
60 from mock import patch
61 import os, sys, locale
62
63 from allmydata.test.common_util import ReallyEqualMixin
64 from allmydata.util import encodingutil
65 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
66     unicode_to_output, quote_output, unicode_platform, listdir_unicode, \
67     FilenameEncodingError, get_io_encoding, get_filesystem_encoding, _reload
68 from allmydata.dirnode import normalize
69
70 from twisted.python import usage
71
class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
    """Checks get_io_encoding() detection and the error paths of several
    encodingutil helpers."""

    def tearDown(self):
        # The tests below call _reload() while sys.stdout,
        # locale.getpreferredencoding, os.listdir or sys.platform is replaced,
        # or assign encodingutil.io_encoding directly. By the time tearDown
        # runs the originals have been put back, so reloading here keeps that
        # temporary state from leaking into later tests (the other test
        # classes in this file do the same).
        _reload()

    @patch('sys.stdout')
    def test_get_io_encoding(self, mock_stdout):
        # The encoding name reported by stdout is expected in lower case.
        mock_stdout.encoding = 'UTF-8'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')

        # 'cp65001' is expected to be reported as 'utf-8'.
        mock_stdout.encoding = 'cp65001'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')

        # On win32 the result is expected to be 'utf-8' whatever stdout says.
        mock_stdout.encoding = 'koi8-r'
        expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), expected)

        # An unknown encoding name must make _reload() fail, except on win32.
        mock_stdout.encoding = 'nonexistent_encoding'
        if sys.platform == "win32":
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
        else:
            self.failUnlessRaises(AssertionError, _reload)

    @patch('locale.getpreferredencoding')
    def test_get_io_encoding_not_from_stdout(self, mock_locale_getpreferredencoding):
        locale  # hush pyflakes
        mock_locale_getpreferredencoding.return_value = 'koi8-r'

        class DummyStdout:
            pass
        old_stdout = sys.stdout
        sys.stdout = DummyStdout()
        try:
            # stdout has no 'encoding' attribute at all: the locale's
            # preferred encoding is expected instead (utf-8 on win32).
            expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), expected)

            # Same expectation when the attribute exists but is None.
            sys.stdout.encoding = None
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), expected)

            # With no locale preference either, 'utf-8' is expected.
            mock_locale_getpreferredencoding.return_value = None
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
        finally:
            sys.stdout = old_stdout

    def test_argv_to_unicode(self):
        # Latin-1 bytes are not valid UTF-8, so decoding the argument with a
        # utf-8 io_encoding must be reported as a usage error.
        encodingutil.io_encoding = 'utf-8'
        self.failUnlessRaises(usage.UsageError,
                              argv_to_unicode,
                              lumiere_nfc.encode('latin1'))

    def test_unicode_to_output(self):
        # koi8-r has no e-grave, so encoding for output must fail.
        encodingutil.io_encoding = 'koi8-r'
        self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)

    @patch('os.listdir')
    def test_no_unicode_normalization(self, mock):
        # Pretend to run on a Unicode platform.
        # We normalized to NFC in 1.7beta, but we now don't.
        orig_platform = sys.platform
        try:
            sys.platform = 'darwin'
            mock.return_value = [Artonwall_nfd]
            _reload()
            self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd])
        finally:
            sys.platform = orig_platform
142
# These tests only make sense on platforms whose filesystems do not store
# filenames as Unicode entities.
class EncodingUtilNonUnicodePlatform(unittest.TestCase):
    def setUp(self):
        # unicode_platform() looks at sys.platform, so pretend to be on
        # Linux for the duration of each test.
        self.original_platform, sys.platform = sys.platform, 'linux'

    def tearDown(self):
        sys.platform = self.original_platform
        _reload()

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        # Case 1: a directory on a UTF-8 filesystem holds one name encoded
        # as UTF-8 and one encoded as latin1.
        utf8_name = lumiere_nfc.encode('utf-8')
        latin1_name = lumiere_nfc.encode('latin1')
        mock_listdir.return_value = [utf8_name, latin1_name]
        mock_getfilesystemencoding.return_value = 'utf-8'
        _reload()
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/dummy')

        # Case 2: the directory name itself cannot be represented in the
        # filesystem encoding, so listing it should fail.
        mock_getfilesystemencoding.return_value = 'ascii'
        _reload()
        unrepresentable_dir = u'/' + lumiere_nfc
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, unrepresentable_dir)


177
178
class EncodingUtil(ReallyEqualMixin):
    """Mixin holding the tests that are replayed for each recorded platform.

    Concrete subclasses (also deriving from unittest.TestCase) supply class
    attributes: 'platform' is always read; 'io_encoding' and 'argv' are read
    by the argv/output tests, and 'filesystem_encoding' and 'dirlist' by the
    listdir test. Tests whose 'argv' or 'dirlist' attribute is missing return
    without checking anything.
    """

    def setUp(self):
        # Impersonate the recorded platform for the duration of the test.
        self.original_platform = sys.platform
        sys.platform = self.platform

    def tearDown(self):
        sys.platform = self.original_platform
        _reload()

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.io_encoding
        argu = lumiere_nfc
        argv = self.argv
        _reload()
        self.failUnlessReallyEqual(argv_to_unicode(argv), argu)

    def test_unicode_to_url(self):
        # Compare the result with the expected UTF-8 bytes. (Passing the
        # expected value as the second argument of failUnless() would only
        # make it the failure message and compare nothing.)
        self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")

    @patch('sys.stdout')
    def test_unicode_to_output(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.io_encoding
        _reload()
        self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)

    def test_unicode_platform(self):
        # Expected unicode_platform() result for each recorded sys.platform.
        matrix = {
          'linux2': False,
          'openbsd4': False,
          'win32':  True,
          'darwin': True,
        }

        _reload()
        self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        if not hasattr(self, 'dirlist'):
            return

        # The recorded encoding may be unknown to the Python running the
        # tests; in that case the replay is skipped rather than failed.
        try:
            u"test".encode(self.filesystem_encoding)
        except (LookupError, AttributeError):
            raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
                                    "that we are testing for the benefit of a different platform."
                                    % (self.filesystem_encoding,))

        mock_listdir.return_value = self.dirlist
        mock_getfilesystemencoding.return_value = self.filesystem_encoding

        _reload()
        filenames = listdir_unicode(u'/dummy')

        # Compare after normalize() and as sets: the recorded listings
        # differ in both ordering and normalization form.
        self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
                             set(TEST_FILENAMES))


243
244
245 class StdlibUnicode(unittest.TestCase):
246     """This mainly tests that some of the stdlib functions support Unicode paths, but also that
247     listdir_unicode works for valid filenames."""
248
249     def skip_if_cannot_represent_filename(self, u):
250         enc = get_filesystem_encoding()
251         if not unicode_platform():
252             try:
253                 u.encode(enc)
254             except UnicodeEncodeError:
255                 raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
256
257     def test_mkdir_open_exists_abspath_listdir_expanduser(self):
258         self.skip_if_cannot_represent_filename(lumiere_nfc)
259
260         try:
261             os.mkdir(lumiere_nfc)
262         except EnvironmentError, e:
263             raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
264                                     "does not support Unicode, even though the platform does." % (e,))
265
266         fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
267         open(fn, 'wb').close()
268         self.failUnless(os.path.exists(fn))
269         self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn)))
270         filenames = listdir_unicode(lumiere_nfc)
271
272         # We only require that the listing includes a filename that is canonically equivalent
273         # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
274         self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
275
276         expanded = os.path.expanduser("~/" + lumiere_nfc)
277         self.failIfIn("~", expanded)
278         self.failUnless(expanded.endswith(lumiere_nfc), expanded)
279
280     def test_open_unrepresentable(self):
281         if unicode_platform():
282             raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")
283
284         enc = get_filesystem_encoding()
285         fn = u'\u2621.txt'
286         try:
287             fn.encode(enc)
288             raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
289         except UnicodeEncodeError:
290             self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
291
292
class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
    """Table-driven checks of quote_output() for several output encodings."""

    def tearDown(self):
        _reload()

    def _check(self, inp, out, enc, optional_quotes):
        # 'out' is the expected result with quote marks. When the quotes are
        # optional, the quotemarks=False result is 'out' minus its first and
        # last character; otherwise both calls must return 'out'.
        if optional_quotes:
            unquoted = out[1:-1]
        else:
            unquoted = out

        def verify(value):
            self.failUnlessReallyEqual(quote_output(value, encoding=enc), out)
            self.failUnlessReallyEqual(quote_output(value, encoding=enc, quotemarks=False), unquoted)

        verify(inp)

        # Expectations of the form b"..." are not re-checked with the input
        # converted to the other string type.
        if out[0:2] == 'b"':
            return

        # Otherwise the same expectation must hold for the input expressed
        # as the other string type (str <-> unicode).
        if isinstance(inp, str):
            verify(unicode(inp))
        else:
            verify(inp.encode('utf-8'))

    def _make_checker(self, enc, out_codec=None):
        # Build a check(inp, out, optional_quotes=False) function bound to
        # 'enc'. If out_codec is given, 'out' is encoded with it first.
        def check(inp, out, optional_quotes=False):
            if out_codec is not None:
                out = out.encode(out_codec)
            self._check(inp, out, enc, optional_quotes)
        return check

    def _test_quote_output_all(self, enc):
        check = self._make_checker(enc)

        # optional single quotes
        check("foo",  "'foo'",  True)
        check("\\",   "'\\'",   True)
        check("$\"`", "'$\"`'", True)

        # mandatory single quotes
        check("\"",   "'\"'")

        # double quotes
        check("'",    "\"'\"")
        check("\n",   "\"\\x0a\"")
        check("\x00", "\"\\x00\"")

        # invalid Unicode and astral planes
        check(u"\uFDD0\uFDEF",       "\"\\ufdd0\\ufdef\"")
        check(u"\uDC00\uD800",       "\"\\udc00\\ud800\"")
        check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
        check(u"\uD800\uDC00",       "\"\\U00010000\"")
        check(u"\uD800\uDC01",       "\"\\U00010001\"")
        check(u"\uD801\uDC00",       "\"\\U00010400\"")
        check(u"\uDBFF\uDFFF",       "\"\\U0010ffff\"")
        check(u"'\uDBFF\uDFFF",      "\"'\\U0010ffff\"")
        check(u"\"\uDBFF\uDFFF",     "\"\\\"\\U0010ffff\"")

        # invalid UTF-8
        check("\xFF",                "b\"\\xff\"")
        check("\x00\"$\\`\x80\xFF",  "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")

    def test_quote_output_ascii(self, enc='ascii'):
        check = self._make_checker(enc)

        self._test_quote_output_all(enc)
        check(u"\u00D7",   "\"\\xd7\"")
        check(u"'\u00D7",  "\"'\\xd7\"")
        check(u"\"\u00D7", "\"\\\"\\xd7\"")
        check(u"\u2621",   "\"\\u2621\"")
        check(u"'\u2621",  "\"'\\u2621\"")
        check(u"\"\u2621", "\"\\\"\\u2621\"")

    def test_quote_output_latin1(self, enc='latin1'):
        # Expected values are written as unicode and encoded to latin1.
        check = self._make_checker(enc, 'latin1')

        self._test_quote_output_all(enc)
        check(u"\u00D7",   u"'\u00D7'", True)
        check(u"'\u00D7",  u"\"'\u00D7\"")
        check(u"\"\u00D7", u"'\"\u00D7'")
        check(u"\u00D7\"", u"'\u00D7\"'", True)
        check(u"\u2621",   u"\"\\u2621\"")
        check(u"'\u2621",  u"\"'\\u2621\"")
        check(u"\"\u2621", u"\"\\\"\\u2621\"")

    def test_quote_output_utf8(self, enc='utf-8'):
        # Expected values are written as unicode and encoded to utf-8.
        check = self._make_checker(enc, 'utf-8')

        self._test_quote_output_all(enc)
        check(u"\u2621",   u"'\u2621'", True)
        check(u"'\u2621",  u"\"'\u2621\"")
        check(u"\"\u2621", u"'\"\u2621'")
        check(u"\u2621\"", u"'\u2621\"'", True)

    def test_quote_output_default(self):
        # Re-run each table with encoding=None after setting the module-level
        # io_encoding to the encoding that table was written for.
        runs = [
            ('ascii', self.test_quote_output_ascii),
            ('latin1', self.test_quote_output_latin1),
            ('utf-8', self.test_quote_output_utf8),
        ]
        for io_enc, run_table in runs:
            encodingutil.io_encoding = io_enc
            run_table(None)


388
389
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
    # Recorded values for a Linux host (see uname) whose filesystem and I/O
    # encodings are both UTF-8; the inherited EncodingUtil tests read them.
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    # "lumi<e-grave>re" as UTF-8 encoded bytes.
    argv = 'lumi\xc3\xa8re'
    platform = 'linux2'
    filesystem_encoding = 'UTF-8'
    io_encoding = 'UTF-8'
    # Directory listing as byte strings, non-ASCII name UTF-8 encoded.
    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']

397
class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
    # Recorded values for a Linux host (see uname) whose filesystem and I/O
    # encodings are both ISO-8859-1; the inherited EncodingUtil tests read
    # them.
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    # "lumi<e-grave>re" as ISO-8859-1 encoded bytes.
    argv = 'lumi\xe8re'
    platform = 'linux2'
    filesystem_encoding = 'ISO-8859-1'
    io_encoding = 'ISO-8859-1'
    # Directory listing as byte strings, non-ASCII name ISO-8859-1 encoded.
    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']

405
class Windows(EncodingUtil, unittest.TestCase):
    # Recorded values for a win32 host (see uname); the inherited
    # EncodingUtil tests read them.
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    # "lumi<e-grave>re" as UTF-8 encoded bytes, matching io_encoding below.
    argv = 'lumi\xc3\xa8re'
    platform = 'win32'
    filesystem_encoding = 'mbcs'
    io_encoding = 'utf-8'
    # Directory listing as unicode objects, with the non-ASCII name in
    # precomposed form (U+00C4).
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']

413
class MacOSXLeopard(EncodingUtil, unittest.TestCase):
    # Recorded values for a darwin host (see uname); the inherited
    # EncodingUtil tests read them.
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    # NOTE(review): this attribute is named 'output', not 'argv'. The
    # EncodingUtil tests that need 'argv' return early when it is missing, so
    # they check nothing for this class -- confirm whether that is intended.
    output = 'lumi\xc3\xa8re'
    platform = 'darwin'
    filesystem_encoding = 'utf-8'
    io_encoding = 'UTF-8'
    # Directory listing as unicode objects, with the non-ASCII name in
    # decomposed form ("A" followed by combining diaeresis U+0308).
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']

421
class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
    # Recorded values for a darwin host (see uname) with a US-ASCII I/O
    # encoding. No 'argv' is defined, so the EncodingUtil tests that need it
    # return early for this class.
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    platform = 'darwin'
    filesystem_encoding = 'utf-8'
    io_encoding = 'US-ASCII'
    # Directory listing as unicode objects, with the non-ASCII name in
    # decomposed form ("A" followed by combining diaeresis U+0308).
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']

428
class OpenBSD(EncodingUtil, unittest.TestCase):
    # Recorded values for an OpenBSD host (see uname). Neither 'argv' nor
    # 'dirlist' is defined, so the EncodingUtil tests that need them return
    # early for this class.
    uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
    platform = 'openbsd4'
    filesystem_encoding = '646'
    io_encoding = '646'
    # Oops, I cannot write filenames containing non-ascii characters