]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - src/allmydata/test/test_encodingutil.py
Changes to Tahoe needed to work with new zetuptoolz (that does not use .exe wrappers...
[tahoe-lafs/tahoe-lafs.git] / src / allmydata / test / test_encodingutil.py
1
# Non-ASCII sample names shared by the tests in this module.

# "lumiere" spelled with the single precomposed code point U+00E8 (e-grave).
lumiere_nfc = u"lumi\xe8re"

# One file name in two spellings: the precomposed code point U+00C4, and a
# plain "A" followed by the combining code point U+0308.
Artonwall_nfc = u"\xc4rtonwall.mp3"
Artonwall_nfd = u"A\u0308rtonwall.mp3"

# File names that the __main__ generator creates in a scratch directory, and
# that EncodingUtil.test_listdir_unicode expects to get back from a listing.
TEST_FILENAMES = (Artonwall_nfc, u"test_file", u"Blah blah.txt")
11
12 # The following main helps to generate a test class for other operating
13 # systems.
14
if __name__ == "__main__":
    # Script mode: print the source text of an EncodingUtil subclass that
    # records how the current platform reports its encodings, so the result
    # can be pasted into this file as a new platform fixture.
    #
    # Exactly one command-line argument is required; it is echoed through
    # repr() so the generated class records what arrived in sys.argv.
    # Exits with status 1 on a usage or import problem, 0 otherwise.
    import sys, os
    import tempfile
    import shutil
    import platform

    if len(sys.argv) != 2:
        print("Usage: %s lumi<e-grave>re" % sys.argv[0])
        sys.exit(1)

    if sys.platform == "win32":
        try:
            from allmydata.windows.fixups import initialize
        except ImportError:
            print("set PYTHONPATH to the src directory")
            sys.exit(1)
        initialize()

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the print function.
    print("")
    print("class MyWeirdOS(EncodingUtil, unittest.TestCase):")
    print("    uname = '%s'" % ' '.join(platform.uname()))
    print("    argv = %s" % repr(sys.argv[1]))
    print("    platform = '%s'" % sys.platform)
    print("    filesystem_encoding = '%s'" % sys.getfilesystemencoding())
    print("    output_encoding = '%s'" % sys.stdout.encoding)
    print("    argv_encoding = '%s'" % sys.stdout.encoding)

    # tmpdir stays None until mkdtemp() has succeeded, so the cleanup in the
    # finally clause never refers to an unbound name.
    tmpdir = None
    try:
        try:
            tmpdir = tempfile.mkdtemp()
            for fname in TEST_FILENAMES:
                open(os.path.join(tmpdir, fname), 'w').close()

            # Use Unicode API under Windows or MacOS X
            if sys.platform in ('win32', 'darwin'):
                dirlist = os.listdir(unicode(tmpdir))
            else:
                dirlist = os.listdir(tmpdir)

            print("    dirlist = %s" % repr(dirlist))
        except Exception:
            # Best effort: a platform that cannot create these names still
            # gets a usable class body, with a comment in place of dirlist.
            print("    # Oops, I cannot write filenames containing non-ascii characters")
        print("")
    finally:
        # Remove the scratch directory even if something unexpected escapes
        # the inner handler.
        if tmpdir is not None:
            shutil.rmtree(tmpdir)
    sys.exit(0)
59
60 from twisted.trial import unittest
61 from mock import patch
62 import os, sys, locale
63
64 from allmydata.test.common_util import ReallyEqualMixin
65 from allmydata.util import encodingutil
66 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
67     unicode_to_output, quote_output, unicode_platform, listdir_unicode, \
68     FilenameEncodingError, get_output_encoding, get_filesystem_encoding, _reload
69 from allmydata.dirnode import normalize
70
71 from twisted.python import usage
72
class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
    # Checks which output encoding encodingutil reports for various
    # sys.stdout / locale settings, and that the conversion helpers raise on
    # input that does not fit the configured encoding.

    def tearDown(self):
        # The tests below assign encodingutil.output_encoding directly and
        # call _reload() while sys.stdout or sys.platform is replaced.  Call
        # _reload() again once the real values are back so that none of that
        # state is carried into later tests.
        _reload()

    @patch('sys.stdout')
    def test_get_output_encoding(self, mock_stdout):
        # 'UTF-8' and 'cp65001' must both be reported as 'utf-8'.
        mock_stdout.encoding = 'UTF-8'
        _reload()
        self.failUnlessReallyEqual(get_output_encoding(), 'utf-8')

        mock_stdout.encoding = 'cp65001'
        _reload()
        self.failUnlessReallyEqual(get_output_encoding(), 'utf-8')

        # On win32 the expected answer is 'utf-8' whatever stdout claims;
        # elsewhere the stdout encoding is expected back unchanged.
        mock_stdout.encoding = 'koi8-r'
        expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
        _reload()
        self.failUnlessReallyEqual(get_output_encoding(), expected)

        # An unknown codec name must make _reload() raise AssertionError,
        # except on win32 where 'utf-8' is still expected.
        mock_stdout.encoding = 'nonexistent_encoding'
        if sys.platform == "win32":
            _reload()
            self.failUnlessReallyEqual(get_output_encoding(), 'utf-8')
        else:
            self.failUnlessRaises(AssertionError, _reload)

    @patch('locale.getpreferredencoding')
    def test_get_output_encoding_not_from_stdout(self, mock_locale_getpreferredencoding):
        locale  # hush pyflakes
        mock_locale_getpreferredencoding.return_value = 'koi8-r'

        # A stand-in stdout that starts out with no 'encoding' attribute.
        class DummyStdout:
            pass
        old_stdout = sys.stdout
        sys.stdout = DummyStdout()
        try:
            expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
            _reload()
            self.failUnlessReallyEqual(get_output_encoding(), expected)

            # Same expectation when the attribute exists but is None.
            sys.stdout.encoding = None
            _reload()
            self.failUnlessReallyEqual(get_output_encoding(), expected)

            # With the locale lookup returning None as well, 'utf-8' is expected.
            mock_locale_getpreferredencoding.return_value = None
            _reload()
            self.failUnlessReallyEqual(get_output_encoding(), 'utf-8')
        finally:
            sys.stdout = old_stdout

    def test_argv_to_unicode(self):
        # Latin-1 bytes passed while the output encoding is set to UTF-8 must
        # be rejected with a UsageError.
        encodingutil.output_encoding = 'utf-8'
        self.failUnlessRaises(usage.UsageError,
                              argv_to_unicode,
                              lumiere_nfc.encode('latin1'))

    def test_unicode_to_output(self):
        # With the output encoding set to koi8-r, converting lumiere_nfc must
        # raise UnicodeEncodeError.
        encodingutil.output_encoding = 'koi8-r'
        self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)

    @patch('os.listdir')
    def test_no_unicode_normalization(self, mock):
        # Pretend to run on a Unicode platform.
        # We normalized to NFC in 1.7beta, but we now don't.
        orig_platform = sys.platform
        try:
            sys.platform = 'darwin'
            mock.return_value = [Artonwall_nfd]
            _reload()
            # The decomposed name must come back exactly as os.listdir gave it.
            self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd])
        finally:
            sys.platform = orig_platform
143
144 # The following tests apply only to platforms that don't store filenames as
145 # Unicode entities on the filesystem.
class EncodingUtilNonUnicodePlatform(unittest.TestCase):
    """Checks listdir_unicode() failures while sys.platform is forced to 'linux'."""

    def setUp(self):
        # unicode_platform() looks at sys.platform, so remember the real value
        # and substitute 'linux' for the duration of each test.
        self.original_platform, sys.platform = sys.platform, 'linux'

    def tearDown(self):
        # Put the real platform back, then call _reload() under it.
        sys.platform = self.original_platform
        _reload()

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        # Case 1: a filesystem that claims UTF-8 returns one name encoded as
        # UTF-8 and one encoded as latin1.
        utf8_name = lumiere_nfc.encode('utf-8')
        latin1_name = lumiere_nfc.encode('latin1')
        mock_listdir.return_value = [utf8_name, latin1_name]
        mock_getfilesystemencoding.return_value = 'utf-8'
        _reload()
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/dummy')

        # Case 2: the directory name itself cannot be represented in the
        # filesystem encoding (ascii here), so listing it should fail.
        unrepresentable_dir = u'/' + lumiere_nfc
        mock_getfilesystemencoding.return_value = 'ascii'
        _reload()
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, unrepresentable_dir)
178
179
class EncodingUtil(ReallyEqualMixin):
    # Mixin holding platform-parameterised checks.  It is combined with
    # unittest.TestCase by the per-platform classes at the end of this file,
    # which supply these class attributes:
    #   platform             value assigned to sys.platform during each test
    #   argv                 (optional) encoded form of lumiere_nfc
    #   output_encoding      encoding assigned to the mocked sys.stdout
    #   filesystem_encoding  value returned by the mocked sys.getfilesystemencoding
    #   dirlist              (optional) value returned by the mocked os.listdir
    # Tests that need an optional attribute return early when it is absent.

    def setUp(self):
        self.original_platform = sys.platform
        sys.platform = self.platform

    def tearDown(self):
        # Restore the real platform first, then call _reload() under it.
        sys.platform = self.original_platform
        _reload()

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.output_encoding
        argu = lumiere_nfc
        argv = self.argv
        _reload()
        self.failUnlessReallyEqual(argv_to_unicode(argv), argu)

    def test_unicode_to_url(self):
        # Compare against the expected UTF-8 bytes.  failUnless(value, expected)
        # would only test that value is truthy and would use the expected
        # bytes as the failure message.
        self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")

    @patch('sys.stdout')
    def test_unicode_to_output(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.output_encoding
        _reload()
        self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)

    def test_unicode_platform(self):
        # Expected unicode_platform() result for each sys.platform value used
        # by the per-platform classes.
        matrix = {
          'linux2': False,
          'openbsd4': False,
          'win32':  True,
          'darwin': True,
        }

        _reload()
        self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        if not hasattr(self, 'dirlist'):
            return

        # The recorded encoding belongs to another platform; skip if the
        # Python running this test has no codec of that name.
        try:
            u"test".encode(self.filesystem_encoding)
        except (LookupError, AttributeError):
            raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
                                    "that we are testing for the benefit of a different platform."
                                    % (self.filesystem_encoding,))

        mock_listdir.return_value = self.dirlist
        mock_getfilesystemencoding.return_value = self.filesystem_encoding

        _reload()
        filenames = listdir_unicode(u'/dummy')

        # Compare as sets of normalize()d names so that neither the listing
        # order nor the spelling of the recorded names matters.
        self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
                             set(TEST_FILENAMES))
244
245
class StdlibUnicode(unittest.TestCase):
    """Checks that several stdlib file functions accept Unicode paths, and that
    listdir_unicode handles valid filenames."""

    def skip_if_cannot_represent_filename(self, u):
        # Raise SkipTest when this is not a Unicode platform and `u` cannot be
        # encoded in the filesystem encoding; otherwise return None.
        enc = get_filesystem_encoding()
        if unicode_platform():
            return
        try:
            u.encode(enc)
        except UnicodeEncodeError:
            raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")

    def test_mkdir_open_exists_abspath_listdir_expanduser(self):
        self.skip_if_cannot_represent_filename(lumiere_nfc)

        # Create a directory with a non-ASCII name under the current directory.
        try:
            os.mkdir(lumiere_nfc)
        except EnvironmentError:
            e = sys.exc_info()[1]
            raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
                                    "does not support Unicode, even though the platform does." % (e,))

        # Create an empty file inside it, then look it up by relative and by
        # absolute path.
        txt_path = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
        open(txt_path, 'wb').close()
        self.failUnless(os.path.exists(txt_path))
        absolute_path = os.path.join(os.getcwdu(), txt_path)
        self.failUnless(os.path.exists(absolute_path))
        listing = listdir_unicode(lumiere_nfc)

        # We only require that the listing includes a filename that is canonically equivalent
        # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
        normalized_names = set([normalize(entry) for entry in listing])
        self.failUnlessIn(lumiere_nfc + ".txt", normalized_names)

        # expanduser must replace the "~" and keep the non-ASCII tail.
        expanded = os.path.expanduser("~/" + lumiere_nfc)
        self.failIfIn("~", expanded)
        self.failUnless(expanded.endswith(lumiere_nfc), expanded)

    def test_open_unrepresentable(self):
        if unicode_platform():
            raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")

        enc = get_filesystem_encoding()
        fn = u'\u2621.txt'
        try:
            fn.encode(enc)
        except UnicodeEncodeError:
            # The name really is unrepresentable, so open() must refuse it too.
            self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
        else:
            raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
292
293
class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
    """Expected quote_output() results for ascii, latin1, utf-8 and the default encoding."""

    def tearDown(self):
        _reload()

    def _check(self, inp, out, enc, optional_quotes):
        # Assert that quote_output(inp, encoding=enc) equals `out`.  With
        # quotemarks=False the result must equal `out` minus its first and
        # last character when optional_quotes is true, and `out` itself
        # otherwise.
        if optional_quotes:
            unquoted = out[1:-1]
        else:
            unquoted = out

        def verify(value):
            self.failUnlessReallyEqual(quote_output(value, encoding=enc), out)
            self.failUnlessReallyEqual(quote_output(value, encoding=enc, quotemarks=False), unquoted)

        verify(inp)

        # Results in the b"..." form are checked for the original input only.
        if out[0:2] == 'b"':
            return

        # Otherwise repeat the checks with the input in its other string type:
        # a str is converted with unicode(), anything else is UTF-8 encoded.
        if isinstance(inp, str):
            verify(unicode(inp))
        else:
            verify(inp.encode('utf-8'))

    def _test_quote_output_all(self, enc):
        # Cases whose expected output is the same for every encoding tested.
        # Each entry is (input, expected output, optional_quotes).
        cases = [
            # optional single quotes
            ("foo",  "'foo'",  True),
            ("\\",   "'\\'",   True),
            ("$\"`", "'$\"`'", True),

            # mandatory single quotes
            ("\"",   "'\"'", False),

            # double quotes
            ("'",    "\"'\"", False),
            ("\n",   "\"\\x0a\"", False),
            ("\x00", "\"\\x00\"", False),

            # invalid Unicode and astral planes
            (u"\uFDD0\uFDEF",       "\"\\ufdd0\\ufdef\"", False),
            (u"\uDC00\uD800",       "\"\\udc00\\ud800\"", False),
            (u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"", False),
            (u"\uD800\uDC00",       "\"\\U00010000\"", False),
            (u"\uD800\uDC01",       "\"\\U00010001\"", False),
            (u"\uD801\uDC00",       "\"\\U00010400\"", False),
            (u"\uDBFF\uDFFF",       "\"\\U0010ffff\"", False),
            (u"'\uDBFF\uDFFF",      "\"'\\U0010ffff\"", False),
            (u"\"\uDBFF\uDFFF",     "\"\\\"\\U0010ffff\"", False),

            # invalid UTF-8
            ("\xFF",                "b\"\\xff\"", False),
            ("\x00\"$\\`\x80\xFF",  "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"", False),
        ]
        for inp, out, optional_quotes in cases:
            self._check(inp, out, enc, optional_quotes)

    def test_quote_output_ascii(self, enc='ascii'):
        # `enc` is a parameter so that test_quote_output_default can rerun
        # these cases with encoding=None.
        self._test_quote_output_all(enc)

        # Expected outputs here are already plain str values.
        cases = [
            (u"\u00D7",   "\"\\xd7\"", False),
            (u"'\u00D7",  "\"'\\xd7\"", False),
            (u"\"\u00D7", "\"\\\"\\xd7\"", False),
            (u"\u2621",   "\"\\u2621\"", False),
            (u"'\u2621",  "\"'\\u2621\"", False),
            (u"\"\u2621", "\"\\\"\\u2621\"", False),
        ]
        for inp, out, optional_quotes in cases:
            self._check(inp, out, enc, optional_quotes)

    def test_quote_output_latin1(self, enc='latin1'):
        self._test_quote_output_all(enc)

        # Expected outputs are written as unicode and encoded to latin1
        # before being compared.
        cases = [
            (u"\u00D7",   u"'\u00D7'", True),
            (u"'\u00D7",  u"\"'\u00D7\"", False),
            (u"\"\u00D7", u"'\"\u00D7'", False),
            (u"\u00D7\"", u"'\u00D7\"'", True),
            (u"\u2621",   u"\"\\u2621\"", False),
            (u"'\u2621",  u"\"'\\u2621\"", False),
            (u"\"\u2621", u"\"\\\"\\u2621\"", False),
        ]
        for inp, out, optional_quotes in cases:
            self._check(inp, out.encode('latin1'), enc, optional_quotes)

    def test_quote_output_utf8(self, enc='utf-8'):
        self._test_quote_output_all(enc)

        # Expected outputs are written as unicode and encoded to UTF-8 before
        # being compared.
        cases = [
            (u"\u2621",   u"'\u2621'", True),
            (u"'\u2621",  u"\"'\u2621\"", False),
            (u"\"\u2621", u"'\"\u2621'", False),
            (u"\u2621\"", u"'\u2621\"'", True),
        ]
        for inp, out, optional_quotes in cases:
            self._check(inp, out.encode('utf-8'), enc, optional_quotes)

    def test_quote_output_default(self):
        # Rerun each per-encoding test with encoding=None after assigning the
        # matching value to encodingutil.output_encoding.
        runs = (
            ('ascii', self.test_quote_output_ascii),
            ('latin1', self.test_quote_output_latin1),
            ('utf-8', self.test_quote_output_utf8),
        )
        for default_encoding, run_cases in runs:
            encodingutil.output_encoding = default_encoding
            run_cases(None)
389
390
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
    # Recorded values for a Linux (Ubuntu) machine with UTF-8 everywhere.

    # Platform identification.
    platform = 'linux2'
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'

    # Encodings, all UTF-8 on this machine.
    filesystem_encoding = 'UTF-8'
    output_encoding = 'UTF-8'
    argv_encoding = 'UTF-8'

    # lumiere_nfc as UTF-8 bytes.
    argv = 'lumi\xc3\xa8re'

    # Directory listing as byte strings.
    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
399
class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
    # Recorded values for a Linux (Ubuntu) machine with ISO-8859-1 everywhere.

    # Platform identification.
    platform = 'linux2'
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'

    # Encodings, all ISO-8859-1 on this machine.
    filesystem_encoding = 'ISO-8859-1'
    output_encoding = 'ISO-8859-1'
    argv_encoding = 'ISO-8859-1'

    # lumiere_nfc as ISO-8859-1 bytes.
    argv = 'lumi\xe8re'

    # Directory listing as byte strings.
    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
408
class Windows(EncodingUtil, unittest.TestCase):
    # Recorded values for a Windows XP machine.

    # Platform identification.
    platform = 'win32'
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'

    # Encodings: 'mbcs' for the filesystem, utf-8 for output and argv.
    filesystem_encoding = 'mbcs'
    output_encoding = 'utf-8'
    argv_encoding = 'utf-8'

    # lumiere_nfc as UTF-8 bytes.
    argv = 'lumi\xc3\xa8re'

    # Directory listing as unicode strings, with the precomposed U+00C4.
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
417
class MacOSXLeopard(EncodingUtil, unittest.TestCase):
    # Recorded values for a Mac OS X (Darwin 9.8.0, PowerPC) machine with
    # UTF-8 output.

    # Platform identification.
    platform = 'darwin'
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'

    # Encodings.
    filesystem_encoding = 'utf-8'
    output_encoding = 'UTF-8'
    argv_encoding = 'UTF-8'

    # NOTE(review): this class sets `output`, not `argv`, so the EncodingUtil
    # tests that return early when `argv` is absent do nothing here.  Confirm
    # whether that is intended.
    output = 'lumi\xc3\xa8re'

    # Directory listing as unicode strings, with "A" followed by U+0308.
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
426
class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
    # Recorded values for a Mac OS X (Darwin 9.8.0, PowerPC) machine whose
    # output and argv encodings are US-ASCII.  No `argv` is recorded.

    # Platform identification.
    platform = 'darwin'
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'

    # Encodings: the filesystem is utf-8 even though output and argv are not.
    filesystem_encoding = 'utf-8'
    output_encoding = 'US-ASCII'
    argv_encoding = 'US-ASCII'

    # Directory listing as unicode strings, with "A" followed by U+0308.
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
434
class OpenBSD(EncodingUtil, unittest.TestCase):
    # Recorded values for an OpenBSD 4.1 machine where every encoding is
    # reported as '646'.  Neither `argv` nor `dirlist` is recorded.

    # Platform identification.
    platform = 'openbsd4'
    uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'

    # Encodings.
    filesystem_encoding = '646'
    output_encoding = '646'
    argv_encoding = '646'

    # Oops, I cannot write filenames containing non-ascii characters