]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - src/allmydata/test/test_encodingutil.py
util.encodingutil: change quote_output to do less unnecessary escaping, and to use...
[tahoe-lafs/tahoe-lafs.git] / src / allmydata / test / test_encodingutil.py
1
# Sample names used throughout this module. The _nfc/_nfd suffix says
# whether the accented letter is a single precomposed code point or a base
# letter followed by a combining mark.
lumiere_nfc = u"lumi\u00E8re"             # U+00E8: precomposed e-grave
Artonwall_nfc = u"\u00C4rtonwall.mp3"     # U+00C4: precomposed A-diaeresis
Artonwall_nfd = u"A\u0308rtonwall.mp3"    # "A" + U+0308 combining diaeresis

# Names that the generator script below creates in a scratch directory, and
# that the per-platform `dirlist` fixtures are compared against.
TEST_FILENAMES = (Artonwall_nfc, u'test_file', u'Blah blah.txt')
11
12 # The following main helps to generate a test class for other operating
13 # systems.
14
15 if __name__ == "__main__":
16     import sys, os
17     import tempfile
18     import shutil
19     import platform
20
21     if len(sys.argv) != 2:
22         print "Usage: %s lumi<e-grave>re" % sys.argv[0]
23         sys.exit(1)
24     
25     print
26     print "class MyWeirdOS(EncodingUtil, unittest.TestCase):"
27     print "    uname = '%s'" % ' '.join(platform.uname())
28     if sys.platform != "win32":
29         print "    argv = %s" % repr(sys.argv[1])
30     print "    platform = '%s'" % sys.platform
31     print "    filesystem_encoding = '%s'" % sys.getfilesystemencoding()
32     print "    output_encoding = '%s'" % sys.stdout.encoding
33     print "    argv_encoding = '%s'" % (sys.platform == "win32" and 'ascii' or sys.stdout.encoding)
34
35     try:
36         tmpdir = tempfile.mkdtemp()
37         for fname in TEST_FILENAMES:
38             open(os.path.join(tmpdir, fname), 'w').close() 
39
40         # Use Unicode API under Windows or MacOS X
41         if sys.platform in ('win32', 'darwin'):
42             dirlist = os.listdir(unicode(tmpdir))
43         else:
44             dirlist = os.listdir(tmpdir)
45
46         print "    dirlist = %s" % repr(dirlist)
47     except:
48         print "    # Oops, I cannot write filenames containing non-ascii characters"
49     print
50
51     shutil.rmtree(tmpdir)
52     sys.exit(0)
53
54 from twisted.trial import unittest
55 from mock import patch
56 import os, sys, locale
57
58 from allmydata.test.common_util import ReallyEqualMixin
59 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
60     unicode_to_output, quote_output, unicode_platform, listdir_unicode, \
61     FilenameEncodingError, get_output_encoding, get_filesystem_encoding, _reload
62 from allmydata.dirnode import normalize
63
64 from twisted.python import usage
65
class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
    """Encoding detection and failure cases of the encodingutil helpers.

    Each test changes what the process appears to be running on (the
    encoding of sys.stdout, the locale's preferred encoding, sys.platform,
    os.listdir) and calls _reload() before looking at the results.
    """

    def tearDown(self):
        # Runs after the @patch decorators have been undone, so this reload
        # happens against the real environment again.
        _reload()

    @patch('sys.stdout')
    def test_get_output_encoding(self, mock_stdout):
        # (encoding reported by stdout, value expected from get_output_encoding)
        expectations = [
            ('UTF-8', 'utf-8'),
            ('cp65001', 'utf-8'),
            ('koi8-r', 'koi8-r'),
        ]
        for reported, expected in expectations:
            mock_stdout.encoding = reported
            _reload()
            self.failUnlessReallyEqual(get_output_encoding(), expected)

        # An encoding name that does not exist must make _reload() itself fail.
        mock_stdout.encoding = 'nonexistent_encoding'
        self.failUnlessRaises(AssertionError, _reload)

    @patch('locale.getpreferredencoding')
    def test_get_output_encoding_not_from_stdout(self, mock_locale_getpreferredencoding):
        locale  # hush pyflakes
        mock_locale_getpreferredencoding.return_value = 'koi8-r'

        class DummyStdout:
            pass

        saved_stdout = sys.stdout
        sys.stdout = DummyStdout()
        try:
            # stdout has no 'encoding' attribute at all.
            _reload()
            self.failUnlessReallyEqual(get_output_encoding(), 'koi8-r')

            # stdout has an 'encoding' attribute, but it is None.
            sys.stdout.encoding = None
            _reload()
            self.failUnlessReallyEqual(get_output_encoding(), 'koi8-r')

            # The locale gives no answer either.
            mock_locale_getpreferredencoding.return_value = None
            _reload()
            self.failUnlessReallyEqual(get_output_encoding(), 'utf-8')
        finally:
            sys.stdout = saved_stdout

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        mock.encoding = 'utf-8'
        _reload()

        # Latin-1 bytes handed over while the output encoding is UTF-8.
        latin1_argument = lumiere_nfc.encode('latin1')
        self.failUnlessRaises(usage.UsageError, argv_to_unicode, latin1_argument)

    @patch('sys.stdout')
    def test_unicode_to_output(self, mock):
        # Encoding koi8-r cannot represent e-grave
        mock.encoding = 'koi8-r'
        _reload()
        self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)

    @patch('os.listdir')
    def test_no_unicode_normalization(self, mock):
        # Pretend to run on a Unicode platform.
        # We normalized to NFC in 1.7beta, but we now don't.
        saved_platform = sys.platform
        sys.platform = 'darwin'
        try:
            mock.return_value = [Artonwall_nfd]
            _reload()
            listing = listdir_unicode(u'/dummy')
            self.failUnlessReallyEqual(listing, [Artonwall_nfd])
        finally:
            sys.platform = saved_platform
138
# The tests in the following class apply only to platforms that don't store
# filenames as Unicode entities on the filesystem.
class EncodingUtilNonUnicodePlatform(unittest.TestCase):
    """listdir_unicode() failure cases, run with sys.platform set to 'linux'."""

    def setUp(self):
        # Mock sys.platform because unicode_platform() uses it
        self.original_platform = sys.platform
        sys.platform = 'linux'

    def tearDown(self):
        sys.platform = self.original_platform
        _reload()

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        # What happens if latin1-encoded filenames are encountered on an UTF-8
        # filesystem?
        mock_listdir.return_value = [lumiere_nfc.encode(codec)
                                     for codec in ('utf-8', 'latin1')]
        mock_getfilesystemencoding.return_value = 'utf-8'
        _reload()
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/dummy')

        # We're trying to list a directory whose name cannot be represented in
        # the filesystem encoding.  This should fail.
        mock_getfilesystemencoding.return_value = 'ascii'
        _reload()
        unrepresentable_dir = u'/' + lumiere_nfc
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, unrepresentable_dir)
173
class EncodingUtil(ReallyEqualMixin):
    """Platform-parameterized checks, mixed into the per-platform classes.

    A subclass also inherits from unittest.TestCase and describes one
    platform through class attributes:

      platform             required; assigned to sys.platform by setUp() and
                           used as a key in test_unicode_platform()
      output_encoding      encoding given to the mocked sys.stdout
      filesystem_encoding  value returned by the mocked
                           sys.getfilesystemencoding()
      argv                 optional; input for argv_to_unicode()
      output               optional; expected unicode_to_output() result
      dirlist              optional; value returned by the mocked os.listdir()

    A test whose optional attribute is missing returns without checking
    anything.
    """

    def setUp(self):
        # Mock sys.platform because unicode_platform() uses it
        self.original_platform = sys.platform
        sys.platform = self.platform

    def tearDown(self):
        sys.platform = self.original_platform
        _reload()

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.output_encoding
        argu = lumiere_nfc
        argv = self.argv
        _reload()
        self.failUnlessReallyEqual(argv_to_unicode(argv), argu)

    def test_unicode_to_url(self):
        # This used to be failUnless(value, expected), which treats the
        # second argument as the failure message and so passed for any
        # non-empty result.  Compare against the UTF-8 bytes for real.
        self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")

    @patch('sys.stdout')
    def test_unicode_to_output(self, mock):
        if not hasattr(self, 'output'):
            return

        mock.encoding = self.output_encoding
        _reload()
        self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.output)

    def test_unicode_platform(self):
        # Expected unicode_platform() result for every platform string used
        # by the subclasses in this file; an unlisted platform is a KeyError.
        matrix = {
          'linux2': False,
          'openbsd4': False,
          'win32':  True,
          'darwin': True,
        }

        _reload()
        self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        if not hasattr(self, 'dirlist'):
            return

        # The fixture may name a codec that the interpreter running the
        # tests does not have; skip rather than fail in that case.
        try:
            u"test".encode(self.filesystem_encoding)
        except (LookupError, AttributeError):
            raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
                                    "that we are testing for the benefit of a different platform."
                                    % (self.filesystem_encoding,))

        mock_listdir.return_value = self.dirlist
        mock_getfilesystemencoding.return_value = self.filesystem_encoding

        _reload()
        filenames = listdir_unicode(u'/dummy')

        # Compare as sets of normalize()d names: the listing order differs
        # between fixtures, and some hold NFD names while TEST_FILENAMES is NFC.
        self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
                             set(TEST_FILENAMES))
239
240
241 class StdlibUnicode(unittest.TestCase):
242     """This mainly tests that some of the stdlib functions support Unicode paths, but also that
243     listdir_unicode works for valid filenames."""
244
245     def skip_if_cannot_represent_filename(self, u):
246         enc = get_filesystem_encoding()
247         if not unicode_platform():
248             try:
249                 u.encode(enc)
250             except UnicodeEncodeError:
251                 raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
252
253     def test_mkdir_open_exists_abspath_listdir_expanduser(self):
254         self.skip_if_cannot_represent_filename(lumiere_nfc)
255
256         try:
257             os.mkdir(lumiere_nfc)
258         except EnvironmentError, e:
259             raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
260                                     "does not support Unicode, even though the platform does." % (e,))
261
262         fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
263         open(fn, 'wb').close()
264         self.failUnless(os.path.exists(fn))
265         self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn)))
266         filenames = listdir_unicode(lumiere_nfc)
267
268         # We only require that the listing includes a filename that is canonically equivalent
269         # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
270         self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
271
272         expanded = os.path.expanduser("~/" + lumiere_nfc)
273         self.failIfIn("~", expanded)
274         self.failUnless(expanded.endswith(lumiere_nfc), expanded)
275
276     def test_open_unrepresentable(self):
277         if unicode_platform():
278             raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")
279
280         enc = get_filesystem_encoding()
281         fn = u'\u2621.txt'
282         try:
283             fn.encode(enc)
284             raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
285         except UnicodeEncodeError:
286             self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
287
288
class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
    """Checks of quote_output() for the ascii, latin1 and utf-8 encodings.

    Each encoding is tested twice: once passed explicitly through the
    `encoding` argument, and once (test_quote_output_mock) as the encoding
    of a mocked sys.stdout with encoding=None passed to quote_output().
    """

    def tearDown(self):
        # test_quote_output_mock calls _reload() while sys.stdout is
        # patched.  Reload once more after the patch is undone, as the other
        # test classes in this file do, so that state does not leak into
        # later tests.
        _reload()

    def _check(self, inp, out, enc, optional_quotes):
        # Assert that quote_output(inp, encoding=enc) returns `out`.
        #
        # optional_quotes: true when `out` carries quotemarks that are
        # dropped under quotemarks=False; the expected unquoted form is then
        # `out` without its first and last character.  Otherwise
        # quotemarks=False must give `out` unchanged.
        if optional_quotes:
            unquoted = out[1:-1]
        else:
            unquoted = out
        self.failUnlessReallyEqual(quote_output(inp, encoding=enc), out)
        self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False), unquoted)

        # Expected outputs in the b"..." form are not cross-checked below.
        if out[0:2] == 'b"':
            return

        # The same text in the other string type (str <-> unicode, via
        # UTF-8 when encoding) must be quoted identically.
        if isinstance(inp, str):
            other_form = unicode(inp)
        else:
            other_form = inp.encode('utf-8')
        self.failUnlessReallyEqual(quote_output(other_form, encoding=enc), out)
        self.failUnlessReallyEqual(quote_output(other_form, encoding=enc, quotemarks=False), unquoted)

    def _test_quote_output_all(self, enc):
        # Cases whose expected output is the same for every encoding tested.

        # optional single quotes
        optional_cases = [
            ("foo",  "'foo'"),
            ("\\",   "'\\'"),
            ("$\"`", "'$\"`'"),
        ]
        for inp, out in optional_cases:
            self._check(inp, out, enc, True)

        required_cases = [
            # mandatory single quotes
            ("\"",   "'\"'"),

            # double quotes
            ("'",    "\"'\""),
            ("\n",   "\"\\x0a\""),
            ("\x00", "\"\\x00\""),

            # invalid Unicode and astral planes
            (u"\uFDD0\uFDEF",       "\"\\ufdd0\\ufdef\""),
            (u"\uDC00\uD800",       "\"\\udc00\\ud800\""),
            (u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\""),
            (u"\uD800\uDC00",       "\"\\U00010000\""),
            (u"\uD800\uDC01",       "\"\\U00010001\""),
            (u"\uD801\uDC00",       "\"\\U00010400\""),
            (u"\uDBFF\uDFFF",       "\"\\U0010ffff\""),
            (u"'\uDBFF\uDFFF",      "\"'\\U0010ffff\""),
            (u"\"\uDBFF\uDFFF",     "\"\\\"\\U0010ffff\""),

            # invalid UTF-8
            ("\xFF",                "b\"\\xff\""),
            ("\x00\"$\\`\x80\xFF",  "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\""),
        ]
        for inp, out in required_cases:
            self._check(inp, out, enc, False)

    def test_quote_output_ascii(self, enc='ascii'):
        # `enc` is a parameter so that test_quote_output_mock can pass None.
        self._test_quote_output_all(enc)

        # Every non-ASCII character has to be escaped.
        ascii_cases = [
            (u"\u00D7",   "\"\\xd7\""),
            (u"'\u00D7",  "\"'\\xd7\""),
            (u"\"\u00D7", "\"\\\"\\xd7\""),
            (u"\u2621",   "\"\\u2621\""),
            (u"'\u2621",  "\"'\\u2621\""),
            (u"\"\u2621", "\"\\\"\\u2621\""),
        ]
        for inp, out in ascii_cases:
            self._check(inp, out, enc, False)

    def test_quote_output_latin1(self, enc='latin1'):
        # `enc` is a parameter so that test_quote_output_mock can pass None.
        self._test_quote_output_all(enc)

        # U+00D7 is representable and passes through; U+2621 is escaped.
        # Expected values are written as unicode and encoded to latin1 here.
        latin1_cases = [
            (u"\u00D7",   u"'\u00D7'",        True),
            (u"'\u00D7",  u"\"'\u00D7\"",     False),
            (u"\"\u00D7", u"'\"\u00D7'",      False),
            (u"\u00D7\"", u"'\u00D7\"'",      True),
            (u"\u2621",   u"\"\\u2621\"",     False),
            (u"'\u2621",  u"\"'\\u2621\"",    False),
            (u"\"\u2621", u"\"\\\"\\u2621\"", False),
        ]
        for inp, out, optional_quotes in latin1_cases:
            self._check(inp, out.encode('latin1'), enc, optional_quotes)

    def test_quote_output_utf8(self, enc='utf-8'):
        # `enc` is a parameter so that test_quote_output_mock can pass None.
        self._test_quote_output_all(enc)

        # U+2621 is representable and passes through unescaped.
        # Expected values are written as unicode and encoded to UTF-8 here.
        utf8_cases = [
            (u"\u2621",   u"'\u2621'",    True),
            (u"'\u2621",  u"\"'\u2621\"", False),
            (u"\"\u2621", u"'\"\u2621'",  False),
            (u"\u2621\"", u"'\u2621\"'",  True),
        ]
        for inp, out, optional_quotes in utf8_cases:
            self._check(inp, out.encode('utf-8'), enc, optional_quotes)

    @patch('sys.stdout')
    def test_quote_output_mock(self, mock_stdout):
        # Re-run each per-encoding suite with encoding=None, after giving
        # the mocked stdout that encoding and reloading.
        suites = [
            ('ascii', self.test_quote_output_ascii),
            ('latin1', self.test_quote_output_latin1),
            ('utf-8', self.test_quote_output_utf8),
        ]
        for stdout_encoding, run_suite in suites:
            mock_stdout.encoding = stdout_encoding
            _reload()
            run_suite(None)
384
385
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: a linux2 platform where every encoding is UTF-8."""
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    platform = 'linux2'

    # Encodings in effect.
    filesystem_encoding = 'UTF-8'
    output_encoding = 'UTF-8'
    argv_encoding = 'UTF-8'

    # Byte strings as they appear in those encodings.
    argv = 'lumi\xc3\xa8re'
    output = 'lumi\xc3\xa8re'
    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
395
class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: a linux2 platform where every encoding is ISO-8859-1."""
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    platform = 'linux2'

    # Encodings in effect.
    filesystem_encoding = 'ISO-8859-1'
    output_encoding = 'ISO-8859-1'
    argv_encoding = 'ISO-8859-1'

    # Byte strings as they appear in those encodings.
    argv = 'lumi\xe8re'
    output = 'lumi\xe8re'
    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
405
class WindowsXP(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: win32 with cp850 output.

    No `argv` attribute is defined, so the inherited test_argv_to_unicode
    returns without checking anything."""
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    platform = 'win32'

    # Encodings in effect.
    filesystem_encoding = 'mbcs'
    output_encoding = 'cp850'
    argv_encoding = 'ascii'

    # Expected output bytes, and a directory listing given as unicode.
    output = 'lumi\x8are'
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
414
class WindowsXP_UTF8(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: win32 with cp65001 output.

    No `argv` attribute is defined, so the inherited test_argv_to_unicode
    returns without checking anything."""
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    platform = 'win32'

    # Encodings in effect.
    filesystem_encoding = 'mbcs'
    output_encoding = 'cp65001'
    argv_encoding = 'ascii'

    # Expected output bytes, and a directory listing given as unicode.
    output = 'lumi\xc3\xa8re'
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
423
class WindowsVista(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: win32 with cp850 output.

    No `argv` attribute is defined, so the inherited test_argv_to_unicode
    returns without checking anything."""
    uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel'
    platform = 'win32'

    # Encodings in effect.
    filesystem_encoding = 'mbcs'
    output_encoding = 'cp850'
    argv_encoding = 'ascii'

    # Expected output bytes, and a directory listing given as unicode.
    output = 'lumi\x8are'
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
432
class MacOSXLeopard(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: darwin with UTF-8 output and argv encodings."""
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    platform = 'darwin'

    # Encodings in effect.
    filesystem_encoding = 'utf-8'
    output_encoding = 'UTF-8'
    argv_encoding = 'UTF-8'

    # argv and output are UTF-8 byte strings.
    argv = 'lumi\xc3\xa8re'
    output = 'lumi\xc3\xa8re'
    # The listing is unicode, with "A" + combining diaeresis (U+0308).
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
442
class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: darwin with US-ASCII output and argv encodings.

    Neither `argv` nor `output` is defined, so the inherited tests that
    need them return without checking anything."""
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    platform = 'darwin'

    # Encodings in effect.
    filesystem_encoding = 'utf-8'
    output_encoding = 'US-ASCII'
    argv_encoding = 'US-ASCII'

    # The listing is unicode, with "A" + combining diaeresis (U+0308).
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
450
class OpenBSD(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: openbsd4 where every encoding is '646'.

    `argv`, `output` and `dirlist` are all absent, so the inherited tests
    that need them return without checking anything."""
    uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
    platform = 'openbsd4'

    # Encodings in effect.
    filesystem_encoding = '646'
    output_encoding = '646'
    argv_encoding = '646'
    # Oops, I cannot write filenames containing non-ascii characters