]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - src/allmydata/test/test_encodingutil.py
Only quote newline characters where necessary. fixes #1484
[tahoe-lafs/tahoe-lafs.git] / src / allmydata / test / test_encodingutil.py
1
# Shared fixtures: accented names spelled with explicit escapes so the
# source file itself stays pure ASCII.

# "lumiere" with a precomposed e-grave (U+00E8).
lumiere_nfc = u"lumi\u00E8re"

# The same filename in two canonically-equivalent spellings: precomposed
# A-diaeresis (U+00C4) versus "A" followed by combining diaeresis (U+0308).
Artonwall_nfc = u"\u00C4rtonwall.mp3"
Artonwall_nfd = u"A\u0308rtonwall.mp3"

# Names created by the generator script below and expected (after
# normalization) by EncodingUtil.test_listdir_unicode.
TEST_FILENAMES = (Artonwall_nfc, u'test_file', u'Blah blah.txt')
11
# The following main helps to generate a test class for other operating
# systems.
#
# Run it as a script with one argument (the word "lumiere" typed with an
# e-grave); it prints a ready-to-paste EncodingUtil subclass describing how
# this platform reports argv, its encodings and a directory listing.

if __name__ == "__main__":
    import sys, os
    import tempfile
    import shutil
    import platform

    # print(...) with a single parenthesized argument behaves exactly like
    # the Python 2 print statement, so the output is unchanged.
    if len(sys.argv) != 2:
        print("Usage: %s lumi<e-grave>re" % sys.argv[0])
        sys.exit(1)

    if sys.platform == "win32":
        try:
            from allmydata.windows.fixups import initialize
        except ImportError:
            print("set PYTHONPATH to the src directory")
            sys.exit(1)
        initialize()

    print("")
    print("class MyWeirdOS(EncodingUtil, unittest.TestCase):")
    print("    uname = '%s'" % ' '.join(platform.uname()))
    print("    argv = %s" % repr(sys.argv[1]))
    print("    platform = '%s'" % sys.platform)
    print("    filesystem_encoding = '%s'" % sys.getfilesystemencoding())
    print("    io_encoding = '%s'" % sys.stdout.encoding)

    # Create the scratch directory *before* entering the try block so that
    # the cleanup below never refers to an unbound name; a failure to create
    # it is reported with its real traceback instead of being masked.
    tmpdir = tempfile.mkdtemp()
    try:
        # Nested try/except inside try/finally keeps this valid on the old
        # Python 2 releases the rest of the file is written for.
        try:
            for fname in TEST_FILENAMES:
                open(os.path.join(tmpdir, fname), 'w').close()

            # Use Unicode API under Windows or MacOS X
            if sys.platform in ('win32', 'darwin'):
                dirlist = os.listdir(unicode(tmpdir))
            else:
                dirlist = os.listdir(tmpdir)

            print("    dirlist = %s" % repr(dirlist))
        except (EnvironmentError, UnicodeError):
            # Only filesystem and encoding failures mean "this platform cannot
            # store these names"; anything else (including Ctrl-C) propagates.
            print("    # Oops, I cannot write filenames containing non-ascii characters")
        print("")
    finally:
        # Always remove the scratch directory, even on unexpected errors.
        shutil.rmtree(tmpdir)

    sys.exit(0)
58
59 from twisted.trial import unittest
60 from mock import patch
61 import os, sys, locale
62
63 from allmydata.test.common_util import ReallyEqualMixin
64 from allmydata.util import encodingutil
65 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
66     unicode_to_output, quote_output, unicode_platform, listdir_unicode, \
67     FilenameEncodingError, get_io_encoding, get_filesystem_encoding, _reload
68 from allmydata.dirnode import normalize
69
70 from twisted.python import usage
71
class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
    """Tests of encodingutil's I/O-encoding detection and of its error paths."""

    def tearDown(self):
        # The tests below either assign encodingutil.io_encoding directly or
        # call _reload() while sys.stdout / sys.platform are replaced.  By the
        # time tearDown runs the @patch decorators and try/finally blocks have
        # put the real objects back, so reloading here (as the other test
        # classes in this file do) keeps that state from leaking into
        # whichever test runs next.
        _reload()

    @patch('sys.stdout')
    def test_get_io_encoding(self, mock_stdout):
        mock_stdout.encoding = 'UTF-8'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')

        mock_stdout.encoding = 'cp65001'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')

        mock_stdout.encoding = 'koi8-r'
        # On win32 the expected answer is always 'utf-8', whatever stdout
        # claims.  (and/or is safe here because 'utf-8' is truthy.)
        expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), expected)

        mock_stdout.encoding = 'nonexistent_encoding'
        if sys.platform == "win32":
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
        else:
            # Elsewhere an unknown encoding name makes _reload() itself fail.
            self.failUnlessRaises(AssertionError, _reload)

    @patch('locale.getpreferredencoding')
    def test_get_io_encoding_not_from_stdout(self, mock_locale_getpreferredencoding):
        locale  # hush pyflakes
        mock_locale_getpreferredencoding.return_value = 'koi8-r'

        # A stdout object with no 'encoding' attribute at all.
        class DummyStdout:
            pass
        old_stdout = sys.stdout
        sys.stdout = DummyStdout()
        try:
            expected = sys.platform == "win32" and 'utf-8' or 'koi8-r'
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), expected)

            # An 'encoding' attribute that is present but None.
            sys.stdout.encoding = None
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), expected)

            # Neither stdout nor the locale offers an encoding.
            mock_locale_getpreferredencoding.return_value = None
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')
        finally:
            sys.stdout = old_stdout

    def test_argv_to_unicode(self):
        # latin1 bytes are not valid UTF-8, so decoding the argument must be
        # reported as a usage error.
        encodingutil.io_encoding = 'utf-8'
        self.failUnlessRaises(usage.UsageError,
                              argv_to_unicode,
                              lumiere_nfc.encode('latin1'))

    def test_unicode_to_output(self):
        # koi8-r has no e-grave, so encoding for output must fail.
        encodingutil.io_encoding = 'koi8-r'
        self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)

    @patch('os.listdir')
    def test_no_unicode_normalization(self, mock):
        # Pretend to run on a Unicode platform.
        # We normalized to NFC in 1.7beta, but we now don't.
        orig_platform = sys.platform
        try:
            sys.platform = 'darwin'
            mock.return_value = [Artonwall_nfd]
            _reload()
            self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd])
        finally:
            sys.platform = orig_platform
142
# The following tests apply only to platforms that don't store filenames as
# Unicode entities on the filesystem.
class EncodingUtilNonUnicodePlatform(unittest.TestCase):
    """listdir_unicode failure cases with sys.platform forced to 'linux'."""

    def setUp(self):
        # unicode_platform() looks at sys.platform, so swap in a fake value
        # and remember the real one for tearDown.
        self.original_platform, sys.platform = sys.platform, 'linux'

    def tearDown(self):
        sys.platform = self.original_platform
        _reload()

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, fake_listdir, fake_fs_encoding):
        # The fake directory holds the same name encoded twice: once as
        # UTF-8 and once as latin1.
        fake_listdir.return_value = [lumiere_nfc.encode(codec)
                                     for codec in ('utf-8', 'latin1')]

        scenarios = (
            # What happens if latin1-encoded filenames are encountered on an
            # UTF-8 filesystem?
            ('utf-8', u'/dummy'),
            # We're trying to list a directory whose name cannot be
            # represented in the filesystem encoding.  This should fail.
            ('ascii', u'/' + lumiere_nfc),
        )
        for fs_encoding, directory in scenarios:
            fake_fs_encoding.return_value = fs_encoding
            _reload()
            self.failUnlessRaises(FilenameEncodingError,
                                  listdir_unicode,
                                  directory)
177
178
class EncodingUtil(ReallyEqualMixin):
    """Platform-parameterised encodingutil tests.

    This mixin is combined with unittest.TestCase by the per-platform classes
    at the end of the file.  Those classes supply the fixture attributes read
    here: `platform` (required by setUp), `io_encoding` and `argv` (argv and
    output tests), and `filesystem_encoding` and `dirlist` (listdir test).  A
    test whose optional fixture attribute is missing returns without checking
    anything.
    """

    def setUp(self):
        # Pretend to be the platform the fixture data was recorded on.
        self.original_platform = sys.platform
        sys.platform = self.platform

    def tearDown(self):
        sys.platform = self.original_platform
        _reload()

    @patch('sys.stdout')
    def test_argv_to_unicode(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.io_encoding
        argu = lumiere_nfc
        argv = self.argv
        _reload()
        self.failUnlessReallyEqual(argv_to_unicode(argv), argu)

    def test_unicode_to_url(self):
        # This used to be failUnless(value, expected), which treats the
        # second argument as the failure *message* and therefore passed for
        # any truthy return value.  Compare against the UTF-8 bytes instead.
        self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")

    @patch('sys.stdout')
    def test_unicode_to_output(self, mock):
        if not hasattr(self, 'argv'):
            return

        mock.encoding = self.io_encoding
        _reload()
        self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)

    def test_unicode_platform(self):
        # Expected unicode_platform() answer for every `platform` value used
        # by the fixture classes in this file.
        matrix = {
          'linux2': False,
          'linux3': False,
          'openbsd4': False,
          'win32':  True,
          'darwin': True,
        }

        _reload()
        self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])

    @patch('sys.getfilesystemencoding')
    @patch('os.listdir')
    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
        if not hasattr(self, 'dirlist'):
            return

        # The fixture's codec may simply not exist on the machine running the
        # tests (e.g. 'mbcs' off Windows); skip rather than fail in that case.
        try:
            u"test".encode(self.filesystem_encoding)
        except (LookupError, AttributeError):
            raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
                                    "that we are testing for the benefit of a different platform."
                                    % (self.filesystem_encoding,))

        mock_listdir.return_value = self.dirlist
        mock_getfilesystemencoding.return_value = self.filesystem_encoding

        _reload()
        filenames = listdir_unicode(u'/dummy')

        # Compare after normalize() so that differently-composed spellings of
        # the same name (see Artonwall_nfc / Artonwall_nfd) are accepted.
        self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
                             set(TEST_FILENAMES))
244
245
class StdlibUnicode(unittest.TestCase):
    """Checks that a handful of stdlib filesystem calls accept Unicode paths, and that
    listdir_unicode returns valid filenames it finds on the real filesystem."""

    def skip_if_cannot_represent_filename(self, u):
        # Skip the calling test when `u` cannot be encoded in the filesystem
        # encoding; platforms with Unicode filenames never need the check.
        enc = get_filesystem_encoding()
        if unicode_platform():
            return
        try:
            u.encode(enc)
        except UnicodeEncodeError:
            raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")

    def test_mkdir_open_exists_abspath_listdir_expanduser(self):
        self.skip_if_cannot_represent_filename(lumiere_nfc)

        try:
            os.mkdir(lumiere_nfc)
        except EnvironmentError:
            # Fetch the active exception via sys.exc_info() rather than
            # binding it in the except clause.
            e = sys.exc_info()[1]
            raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
                                    "does not support Unicode, even though the platform does." % (e,))

        # A file named like the directory, plus ".txt", inside that directory.
        fn = u"%s/%s.txt" % (lumiere_nfc, lumiere_nfc)
        open(fn, 'wb').close()
        self.failUnless(os.path.exists(fn))
        absolute_fn = os.path.join(os.getcwdu(), fn)
        self.failUnless(os.path.exists(absolute_fn))
        filenames = listdir_unicode(lumiere_nfc)

        # We only require that the listing includes a filename that is canonically equivalent
        # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
        normalized_names = set(normalize(fname) for fname in filenames)
        self.failUnlessIn(lumiere_nfc + ".txt", normalized_names)

        expanded = os.path.expanduser("~/" + lumiere_nfc)
        self.failIfIn("~", expanded)
        self.failUnless(expanded.endswith(lumiere_nfc), expanded)

    def test_open_unrepresentable(self):
        if unicode_platform():
            raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")

        enc = get_filesystem_encoding()
        fn = u'\u2621.txt'
        try:
            fn.encode(enc)
        except UnicodeEncodeError:
            # The name really is unrepresentable, so open() must refuse it
            # with the same error.
            self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
        else:
            raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
292
293
class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
    """Expected quote_output() results for ascii, latin1 and utf-8 output encodings."""

    def tearDown(self):
        _reload()

    def _check(self, inp, out, enc, optional_quotes, quote_newlines):
        # `out` is the expected result with quotemarks; when the quotes are
        # optional, the quotemarks=False result is `out` minus its first and
        # last character, otherwise it is `out` unchanged.
        if optional_quotes:
            unquoted = out[1:-1]
        else:
            unquoted = out

        def verify(value):
            self.failUnlessReallyEqual(quote_output(value, encoding=enc, quote_newlines=quote_newlines), out)
            self.failUnlessReallyEqual(quote_output(value, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), unquoted)

        verify(inp)

        # Results in b"..." form are only checked for the input as given.
        if out[0:2] == 'b"':
            return

        # Otherwise the same result is required for the input's other string
        # type: str input is retried as unicode, unicode input as UTF-8 bytes.
        if isinstance(inp, str):
            verify(unicode(inp))
        else:
            verify(inp.encode('utf-8'))

    def _checker(self, enc, out_codec=None):
        # Build the check(inp, out, ...) helper used by the tests below.  When
        # out_codec is given, the expected output is written as unicode in
        # the test and encoded with that codec before comparison.
        def check(inp, out, optional_quotes=False, quote_newlines=None):
            if out_codec is not None:
                out = out.encode(out_codec)
            self._check(inp, out, enc, optional_quotes, quote_newlines)
        return check

    def _test_quote_output_all(self, enc):
        check = self._checker(enc)

        # optional single quotes
        for inp, out in (
            ("foo",  "'foo'"),
            ("\\",   "'\\'"),
            ("$\"`", "'$\"`'"),
        ):
            check(inp, out, True)
        check("\n",   "'\n'",   True, quote_newlines=False)

        # mandatory single quotes
        check("\"",   "'\"'")

        # double quotes
        check("'",    "\"'\"")
        check("\n",   "\"\\x0a\"", quote_newlines=True)
        check("\x00", "\"\\x00\"")

        # invalid Unicode and astral planes
        for inp, out in (
            (u"\uFDD0\uFDEF",       "\"\\ufdd0\\ufdef\""),
            (u"\uDC00\uD800",       "\"\\udc00\\ud800\""),
            (u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\""),
            (u"\uD800\uDC00",       "\"\\U00010000\""),
            (u"\uD800\uDC01",       "\"\\U00010001\""),
            (u"\uD801\uDC00",       "\"\\U00010400\""),
            (u"\uDBFF\uDFFF",       "\"\\U0010ffff\""),
            (u"'\uDBFF\uDFFF",      "\"'\\U0010ffff\""),
            (u"\"\uDBFF\uDFFF",     "\"\\\"\\U0010ffff\""),
        ):
            check(inp, out)

        # invalid UTF-8
        check("\xFF",                "b\"\\xff\"")
        check("\x00\"$\\`\x80\xFF",  "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")

    def test_quote_output_ascii(self, enc='ascii'):
        check = self._checker(enc)

        self._test_quote_output_all(enc)
        for inp, out in (
            (u"\u00D7",   "\"\\xd7\""),
            (u"'\u00D7",  "\"'\\xd7\""),
            (u"\"\u00D7", "\"\\\"\\xd7\""),
            (u"\u2621",   "\"\\u2621\""),
            (u"'\u2621",  "\"'\\u2621\""),
            (u"\"\u2621", "\"\\\"\\u2621\""),
        ):
            check(inp, out)
        check(u"\n",       "'\n'",      True, quote_newlines=False)
        check(u"\n",       "\"\\x0a\"", quote_newlines=True)

    def test_quote_output_latin1(self, enc='latin1'):
        # Expected values are written as unicode and encoded to latin1.
        check = self._checker(enc, 'latin1')

        self._test_quote_output_all(enc)
        check(u"\u00D7",   u"'\u00D7'", True)
        check(u"'\u00D7",  u"\"'\u00D7\"")
        check(u"\"\u00D7", u"'\"\u00D7'")
        check(u"\u00D7\"", u"'\u00D7\"'", True)
        check(u"\u2621",   u"\"\\u2621\"")
        check(u"'\u2621",  u"\"'\\u2621\"")
        check(u"\"\u2621", u"\"\\\"\\u2621\"")
        check(u"\n",       u"'\n'", True, quote_newlines=False)
        check(u"\n",       u"\"\\x0a\"", quote_newlines=True)

    def test_quote_output_utf8(self, enc='utf-8'):
        # Expected values are written as unicode and encoded to UTF-8.
        check = self._checker(enc, 'utf-8')

        self._test_quote_output_all(enc)
        check(u"\u2621",   u"'\u2621'", True)
        check(u"'\u2621",  u"\"'\u2621\"")
        check(u"\"\u2621", u"'\"\u2621'")
        check(u"\u2621\"", u"'\u2621\"'", True)
        check(u"\n",       u"'\n'", True, quote_newlines=False)
        check(u"\n",       u"\"\\x0a\"", quote_newlines=True)

    def test_quote_output_default(self):
        # With encoding=None, quote_output is expected to behave as it does
        # for whichever encoding encodingutil.io_encoding currently names.
        for io_encoding, run_suite in (
            ('ascii',  self.test_quote_output_ascii),
            ('latin1', self.test_quote_output_latin1),
            ('utf-8',  self.test_quote_output_utf8),
        ):
            encodingutil.io_encoding = io_encoding
            run_suite(None)
396
397
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: a linux2 host with UTF-8 filesystem and I/O encodings."""

    # Where the values were recorded (not read by the tests in this file).
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    platform = 'linux2'

    # Encodings reported by the platform.
    filesystem_encoding = 'UTF-8'
    io_encoding = 'UTF-8'

    # The bytes of lumiere_nfc in io_encoding, and a raw directory listing
    # in filesystem_encoding.
    argv = 'lumi\xc3\xa8re'
    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
405
class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: a linux2 host with ISO-8859-1 filesystem and I/O encodings."""

    # Where the values were recorded (not read by the tests in this file).
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    platform = 'linux2'

    # Encodings reported by the platform.
    filesystem_encoding = 'ISO-8859-1'
    io_encoding = 'ISO-8859-1'

    # The bytes of lumiere_nfc in io_encoding, and a raw directory listing
    # in filesystem_encoding.
    argv = 'lumi\xe8re'
    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
413
class Windows(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: a win32 host ('mbcs' filesystem encoding, utf-8 I/O)."""

    # Where the values were recorded (not read by the tests in this file).
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    platform = 'win32'

    # Encodings reported by the platform.
    filesystem_encoding = 'mbcs'
    io_encoding = 'utf-8'

    # argv is a UTF-8 byte string; the directory listing is already unicode.
    argv = 'lumi\xc3\xa8re'
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
421
class MacOSXLeopard(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: a darwin host with utf-8 filesystem and UTF-8 I/O encodings."""

    # Where the values were recorded (not read by the tests in this file).
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    platform = 'darwin'

    # Encodings reported by the platform.
    filesystem_encoding = 'utf-8'
    io_encoding = 'UTF-8'

    # NOTE(review): the EncodingUtil tests look for an attribute named
    # `argv`; nothing in this file reads `output`, so the argv/output tests
    # return early for this class.  Possibly this was meant to be `argv` --
    # confirm before renaming.
    output = 'lumi\xc3\xa8re'

    # Unicode listing; the accented name uses "A" + combining diaeresis.
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
429
class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: a darwin host whose I/O encoding is US-ASCII.

    No `argv` is defined, so the argv/output tests return without checking.
    """

    # Where the values were recorded (not read by the tests in this file).
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    platform = 'darwin'

    # Encodings reported by the platform.
    filesystem_encoding = 'utf-8'
    io_encoding = 'US-ASCII'

    # Unicode listing; the accented name uses "A" + combining diaeresis.
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
436
class OpenBSD(EncodingUtil, unittest.TestCase):
    """EncodingUtil fixture: an openbsd4 host whose encodings are both '646'.

    Neither `argv` nor `dirlist` is defined, so only the tests that need
    neither of them check anything for this class.
    """

    # Where the values were recorded (not read by the tests in this file).
    uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
    platform = 'openbsd4'

    # Encodings reported by the platform.
    filesystem_encoding = '646'
    io_encoding = '646'
    # Oops, I cannot write filenames containing non-ascii characters