]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - src/allmydata/test/test_encodingutil.py
Eliminate mock dependency.
[tahoe-lafs/tahoe-lafs.git] / src / allmydata / test / test_encodingutil.py
1
# Sample text shared by the tests below.  The ``_nfc`` / ``_nfd`` suffixes mark
# the precomposed (U+00C4) and decomposed (A + U+0308) spellings of one name.
lumiere_nfc = u"lumi\u00E8re"
Artonwall_nfc = u"\u00C4rtonwall.mp3"
Artonwall_nfd = u"A\u0308rtonwall.mp3"

# Names the fixture generator below creates on disk, and the set that the
# listdir tests expect back once each listed name has been normalized.
TEST_FILENAMES = (
    Artonwall_nfc,
    u'test_file',
    u'Blah blah.txt',
)
11
12 # The following main helps to generate a test class for other operating
13 # systems.
14
15 if __name__ == "__main__":
16     import sys, os
17     import tempfile
18     import shutil
19     import platform
20
21     if len(sys.argv) != 2:
22         print "Usage: %s lumi<e-grave>re" % sys.argv[0]
23         sys.exit(1)
24
25     if sys.platform == "win32":
26         try:
27             from allmydata.windows.fixups import initialize
28         except ImportError:
29             print "set PYTHONPATH to the src directory"
30             sys.exit(1)
31         initialize()
32
33     print
34     print "class MyWeirdOS(EncodingUtil, unittest.TestCase):"
35     print "    uname = '%s'" % ' '.join(platform.uname())
36     print "    argv = %s" % repr(sys.argv[1])
37     print "    platform = '%s'" % sys.platform
38     print "    filesystem_encoding = '%s'" % sys.getfilesystemencoding()
39     print "    io_encoding = '%s'" % sys.stdout.encoding
40     try:
41         tmpdir = tempfile.mkdtemp()
42         for fname in TEST_FILENAMES:
43             open(os.path.join(tmpdir, fname), 'w').close()
44
45         # Use Unicode API under Windows or MacOS X
46         if sys.platform in ('win32', 'darwin'):
47             dirlist = os.listdir(unicode(tmpdir))
48         else:
49             dirlist = os.listdir(tmpdir)
50
51         print "    dirlist = %s" % repr(dirlist)
52     except:
53         print "    # Oops, I cannot write filenames containing non-ascii characters"
54     print
55
56     shutil.rmtree(tmpdir)
57     sys.exit(0)
58
59
60 import os, sys, locale
61
62 from twisted.trial import unittest
63
64 from allmydata.test.common_util import ReallyEqualMixin
65 from allmydata.util import encodingutil, fileutil
66 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
67     unicode_to_output, quote_output, quote_path, quote_local_unicode_path, \
68     unicode_platform, listdir_unicode, FilenameEncodingError, get_io_encoding, \
69     get_filesystem_encoding, to_str, from_utf8_or_none, _reload
70 from allmydata.dirnode import normalize
71
72 from twisted.python import usage
73
74
class MockStdout(object):
    """Empty stand-in patched over ``sys.stdout``; tests assign ``encoding`` on it as needed."""
77
class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
    """Checks of I/O-encoding detection and of the error paths of a few encodingutil helpers.

    NOTE(review): test_argv_to_unicode and test_unicode_to_output assign
    encodingutil.io_encoding directly, and this class has no tearDown that
    calls _reload() (EncodingUtilNonUnicodePlatform, EncodingUtil and
    QuoteOutput all have one) -- confirm that this is intended.
    """

    def test_get_io_encoding(self):
        # A stand-in stdout is patched in; its ``encoding`` attribute is
        # rewritten before every _reload().
        fake_stdout = MockStdout()
        self.patch(sys, 'stdout', fake_stdout)
        on_windows = (sys.platform == "win32")

        # On win32 the test expects 'utf-8' whatever stdout claims.
        if on_windows:
            koi8_result = 'utf-8'
        else:
            koi8_result = 'koi8-r'

        cases = [
            ('UTF-8', 'utf-8'),
            ('cp65001', 'utf-8'),
            ('koi8-r', koi8_result),
        ]
        for declared, wanted in cases:
            fake_stdout.encoding = declared
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), wanted)

        # An unknown codec name: _reload() must raise, except on win32.
        fake_stdout.encoding = 'nonexistent_encoding'
        if not on_windows:
            self.failUnlessRaises(AssertionError, _reload)
        else:
            _reload()
            self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')

    def test_get_io_encoding_not_from_stdout(self):
        # locale.getpreferredencoding is replaced by a closure over
        # ``locale_answer``; rebinding that name later changes what the
        # patched function reports.
        locale_answer = 'koi8-r'
        def fake_getpreferredencoding():
            return locale_answer
        self.patch(locale, 'getpreferredencoding', fake_getpreferredencoding)
        fake_stdout = MockStdout()
        self.patch(sys, 'stdout', fake_stdout)

        if sys.platform == "win32":
            wanted = 'utf-8'
        else:
            wanted = 'koi8-r'

        # Case 1: the stand-in stdout has no ``encoding`` attribute at all.
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), wanted)

        # Case 2: the attribute exists but is None.
        fake_stdout.encoding = None
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), wanted)

        # Case 3: the locale reports None as well.
        locale_answer = None
        _reload()
        self.failUnlessReallyEqual(get_io_encoding(), 'utf-8')

    def test_argv_to_unicode(self):
        # Latin-1 bytes handed to argv_to_unicode while io_encoding is UTF-8
        # must be rejected with a UsageError.
        encodingutil.io_encoding = 'utf-8'
        latin1_bytes = lumiere_nfc.encode('latin1')
        self.failUnlessRaises(usage.UsageError, argv_to_unicode, latin1_bytes)

    def test_unicode_to_output(self):
        # With io_encoding set to koi8-r, encoding the sample text must fail.
        encodingutil.io_encoding = 'koi8-r'
        self.failUnlessRaises(UnicodeEncodeError,
                              unicode_to_output,
                              lumiere_nfc)

    def test_no_unicode_normalization(self):
        # Pretend to run on a Unicode platform ('darwin') whose os.listdir
        # hands back an NFD name.  listdir_unicode normalized to NFC in
        # 1.7beta, but now must return the name untouched.
        def fake_listdir(path):
            return [Artonwall_nfd]
        self.patch(os, 'listdir', fake_listdir)
        self.patch(sys, 'platform', 'darwin')

        _reload()
        listing = listdir_unicode(u'/dummy')
        self.failUnlessReallyEqual(listing, [Artonwall_nfd])
144
145
146 # The following tests apply only to platforms that don't store filenames as
147 # Unicode entities on the filesystem.
class EncodingUtilNonUnicodePlatform(unittest.TestCase):
    """listdir_unicode failure cases, run with sys.platform forced to 'linux'."""

    def setUp(self):
        # unicode_platform() consults sys.platform, so swap it for the
        # duration of each test and remember the real value.
        self.original_platform = sys.platform
        sys.platform = 'linux'

    def tearDown(self):
        # Put the real platform back first, then let encodingutil re-read it.
        sys.platform = self.original_platform
        _reload()

    def test_listdir_unicode(self):
        # Scenario 1: a filesystem declared as UTF-8 that also contains a
        # latin1-encoded name.
        def fake_listdir(path):
            return [lumiere_nfc.encode(codec) for codec in ('utf-8', 'latin1')]
        self.patch(os, 'listdir', fake_listdir)

        # The patched getfilesystemencoding closes over ``fs_encoding``, so
        # rebinding it below changes the reported encoding.
        fs_encoding = 'utf-8'
        def fake_getfilesystemencoding():
            return fs_encoding
        self.patch(sys, 'getfilesystemencoding', fake_getfilesystemencoding)

        _reload()
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/dummy')

        # Scenario 2: the directory name itself cannot be represented in the
        # filesystem encoding (ASCII).  This should fail as well.
        fs_encoding = 'ascii'
        _reload()
        unrepresentable_dir = u'/' + lumiere_nfc
        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, unrepresentable_dir)
185
186
class EncodingUtil(ReallyEqualMixin):
    """Shared test body, run once per recorded platform fixture.

    Concrete subclasses (which also derive from unittest.TestCase) provide
    the fixture as class attributes:

      platform             -- value installed as sys.platform (always read)
      io_encoding, argv    -- read by the argv / output tests
      filesystem_encoding,
      dirlist              -- read by the listdir test

    Tests whose fixture data (``argv`` or ``dirlist``) is absent return early.
    """

    def setUp(self):
        # Impersonate the fixture's platform for the duration of the test.
        self.original_platform = sys.platform
        sys.platform = self.platform

    def tearDown(self):
        # Restore the real platform, then let encodingutil re-read it.
        sys.platform = self.original_platform
        _reload()

    def test_argv_to_unicode(self):
        """The recorded argv bytes must decode to the NFC sample text."""
        if not hasattr(self, 'argv'):
            return

        mock_stdout = MockStdout()
        mock_stdout.encoding = self.io_encoding
        self.patch(sys, 'stdout', mock_stdout)

        argu = lumiere_nfc
        argv = self.argv
        _reload()
        self.failUnlessReallyEqual(argv_to_unicode(argv), argu)

    def test_unicode_to_url(self):
        """unicode_to_url must return the UTF-8 bytes of the sample text."""
        # The expected value used to be passed as failUnless()'s second
        # argument, which is the failure *message*; the result was only
        # checked for truthiness.  Compare the values for real.
        self.failUnlessReallyEqual(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re")

    def test_unicode_to_output(self):
        """Encoding the sample text for output must reproduce the recorded argv bytes."""
        if not hasattr(self, 'argv'):
            return

        mock_stdout = MockStdout()
        mock_stdout.encoding = self.io_encoding
        self.patch(sys, 'stdout', mock_stdout)

        _reload()
        self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv)

    def test_unicode_platform(self):
        """unicode_platform() must match the expectation for the fixture's platform."""
        matrix = {
          'linux2': False,
          'linux3': False,
          'openbsd4': False,
          'win32':  True,
          'darwin': True,
        }

        _reload()
        self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])

    def test_listdir_unicode(self):
        """listdir_unicode on the recorded listing must yield TEST_FILENAMES after normalization."""
        if not hasattr(self, 'dirlist'):
            return

        # The fixture's codec may not exist on the machine running the tests.
        try:
            u"test".encode(self.filesystem_encoding)
        except (LookupError, AttributeError):
            raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
                                    "that we are testing for the benefit of a different platform."
                                    % (self.filesystem_encoding,))

        def call_os_listdir(path):
            return self.dirlist
        self.patch(os, 'listdir', call_os_listdir)

        def call_sys_getfilesystemencoding():
            return self.filesystem_encoding
        self.patch(sys, 'getfilesystemencoding', call_sys_getfilesystemencoding)

        _reload()
        filenames = listdir_unicode(u'/dummy')

        # Compare as sets of normalized names: listing order is not part of
        # the contract, and some fixtures record decomposed (NFD) names.
        self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
                             set(TEST_FILENAMES))
259
260
261 class StdlibUnicode(unittest.TestCase):
262     """This mainly tests that some of the stdlib functions support Unicode paths, but also that
263     listdir_unicode works for valid filenames."""
264
265     def skip_if_cannot_represent_filename(self, u):
266         enc = get_filesystem_encoding()
267         if not unicode_platform():
268             try:
269                 u.encode(enc)
270             except UnicodeEncodeError:
271                 raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
272
273     def test_mkdir_open_exists_abspath_listdir_expanduser(self):
274         self.skip_if_cannot_represent_filename(lumiere_nfc)
275
276         try:
277             os.mkdir(lumiere_nfc)
278         except EnvironmentError, e:
279             raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
280                                     "does not support Unicode, even though the platform does." % (e,))
281
282         fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt'
283         open(fn, 'wb').close()
284         self.failUnless(os.path.exists(fn))
285         self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn)))
286         filenames = listdir_unicode(lumiere_nfc)
287
288         # We only require that the listing includes a filename that is canonically equivalent
289         # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
290         self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
291
292         expanded = fileutil.expanduser(u"~/" + lumiere_nfc)
293         self.failIfIn(u"~", expanded)
294         self.failUnless(expanded.endswith(lumiere_nfc), expanded)
295
296     def test_open_unrepresentable(self):
297         if unicode_platform():
298             raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")
299
300         enc = get_filesystem_encoding()
301         fn = u'\u2621.txt'
302         try:
303             fn.encode(enc)
304             raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.")
305         except UnicodeEncodeError:
306             self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb')
307
308
class QuoteOutput(ReallyEqualMixin, unittest.TestCase):
    # Expectation tables for quote_output() under several output encodings.
    # Each row is an input plus the exact quoted form expected back.

    def tearDown(self):
        # test_quote_output_default overwrites encodingutil.io_encoding;
        # re-run the module's _reload() after every test.
        _reload()

    def _check(self, inp, out, enc, optional_quotes, quote_newlines):
        # Assert quote_output(inp) == out, and also check the quotemarks=False
        # form.  When ``optional_quotes`` is true, the quotemarks=False
        # expectation is ``out`` with its first and last character removed;
        # otherwise it is ``out`` unchanged.
        out2 = out
        if optional_quotes:
            out2 = out2[1:-1]
        self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quote_newlines=quote_newlines), out)
        self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
        if out[0:2] == 'b"':
            # Expected output is in the b"..." form: no cross-type re-check.
            pass
        elif isinstance(inp, str):
            # Byte-string input: the unicode() conversion of it must quote identically.
            self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out)
            self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)
        else:
            # Unicode input: its UTF-8 encoding must quote identically.
            self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quote_newlines=quote_newlines), out)
            self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2)

    def _test_quote_output_all(self, enc):
        # Rows whose expected output is the same for every encoding tested
        # here; called by the ascii, latin1 and utf-8 tests below.
        def check(inp, out, optional_quotes=False, quote_newlines=None):
            self._check(inp, out, enc, optional_quotes, quote_newlines)

        # optional single quotes
        check("foo",  "'foo'",  True)
        check("\\",   "'\\'",   True)
        check("$\"`", "'$\"`'", True)
        check("\n",   "'\n'",   True, quote_newlines=False)

        # mandatory single quotes
        check("\"",   "'\"'")

        # double quotes
        check("'",    "\"'\"")
        check("\n",   "\"\\x0a\"", quote_newlines=True)
        check("\x00", "\"\\x00\"")

        # invalid Unicode and astral planes
        check(u"\uFDD0\uFDEF",       "\"\\ufdd0\\ufdef\"")
        check(u"\uDC00\uD800",       "\"\\udc00\\ud800\"")
        check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"")
        check(u"\uD800\uDC00",       "\"\\U00010000\"")
        check(u"\uD800\uDC01",       "\"\\U00010001\"")
        check(u"\uD801\uDC00",       "\"\\U00010400\"")
        check(u"\uDBFF\uDFFF",       "\"\\U0010ffff\"")
        check(u"'\uDBFF\uDFFF",      "\"'\\U0010ffff\"")
        check(u"\"\uDBFF\uDFFF",     "\"\\\"\\U0010ffff\"")

        # invalid UTF-8
        check("\xFF",                "b\"\\xff\"")
        check("\x00\"$\\`\x80\xFF",  "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"")

    def test_quote_output_ascii(self, enc='ascii'):
        # ASCII output: every non-ASCII character is expected in escaped form.
        # ``enc`` is a parameter so test_quote_output_default can pass None.
        def check(inp, out, optional_quotes=False, quote_newlines=None):
            self._check(inp, out, enc, optional_quotes, quote_newlines)

        self._test_quote_output_all(enc)
        check(u"\u00D7",   "\"\\xd7\"")
        check(u"'\u00D7",  "\"'\\xd7\"")
        check(u"\"\u00D7", "\"\\\"\\xd7\"")
        check(u"\u2621",   "\"\\u2621\"")
        check(u"'\u2621",  "\"'\\u2621\"")
        check(u"\"\u2621", "\"\\\"\\u2621\"")
        check(u"\n",       "'\n'",      True, quote_newlines=False)
        check(u"\n",       "\"\\x0a\"", quote_newlines=True)

    def test_quote_output_latin1(self, enc='latin1'):
        # Latin-1 output: U+00D7 is expected verbatim, U+2621 escaped.
        # Expected values are written as unicode and encoded to latin1 here.
        def check(inp, out, optional_quotes=False, quote_newlines=None):
            self._check(inp, out.encode('latin1'), enc, optional_quotes, quote_newlines)

        self._test_quote_output_all(enc)
        check(u"\u00D7",   u"'\u00D7'", True)
        check(u"'\u00D7",  u"\"'\u00D7\"")
        check(u"\"\u00D7", u"'\"\u00D7'")
        check(u"\u00D7\"", u"'\u00D7\"'", True)
        check(u"\u2621",   u"\"\\u2621\"")
        check(u"'\u2621",  u"\"'\\u2621\"")
        check(u"\"\u2621", u"\"\\\"\\u2621\"")
        check(u"\n",       u"'\n'", True, quote_newlines=False)
        check(u"\n",       u"\"\\x0a\"", quote_newlines=True)

    def test_quote_output_utf8(self, enc='utf-8'):
        # UTF-8 output: U+2621 is expected verbatim.  Expected values are
        # written as unicode and encoded to UTF-8 here.
        def check(inp, out, optional_quotes=False, quote_newlines=None):
            self._check(inp, out.encode('utf-8'), enc, optional_quotes, quote_newlines)

        self._test_quote_output_all(enc)
        check(u"\u2621",   u"'\u2621'", True)
        check(u"'\u2621",  u"\"'\u2621\"")
        check(u"\"\u2621", u"'\"\u2621'")
        check(u"\u2621\"", u"'\u2621\"'", True)
        check(u"\n",       u"'\n'", True, quote_newlines=False)
        check(u"\n",       u"\"\\x0a\"", quote_newlines=True)

    def test_quote_output_default(self):
        # Re-run each table with encoding=None after setting
        # encodingutil.io_encoding, expecting the same results as with the
        # encoding passed explicitly.
        encodingutil.io_encoding = 'ascii'
        self.test_quote_output_ascii(None)

        encodingutil.io_encoding = 'latin1'
        self.test_quote_output_latin1(None)

        encodingutil.io_encoding = 'utf-8'
        self.test_quote_output_utf8(None)
411
412
class QuotePaths(ReallyEqualMixin, unittest.TestCase):
    """Expected output of quote_path and quote_local_unicode_path."""

    def test_quote_path(self):
        check = self.failUnlessReallyEqual

        # Plain components: single quotes, dropped when quotemarks=False.
        check(quote_path([u'foo', u'bar']), "'foo/bar'")
        check(quote_path([u'foo', u'bar'], quotemarks=True), "'foo/bar'")
        check(quote_path([u'foo', u'bar'], quotemarks=False), "foo/bar")

        # A component with a newline: double quotes with the newline escaped,
        # and the quotes are expected even when quotemarks=False.
        check(quote_path([u'foo', u'\nbar']), '"foo/\\x0abar"')
        check(quote_path([u'foo', u'\nbar'], quotemarks=True), '"foo/\\x0abar"')
        check(quote_path([u'foo', u'\nbar'], quotemarks=False), '"foo/\\x0abar"')

        def pick(win32, other):
            # Choose the expectation for the platform running the tests.
            if sys.platform == "win32":
                return win32
            return other

        # Extended-length Windows paths: on win32 the expected output has the
        # \\?\ (or \\?\UNC\) prefix removed; elsewhere it is kept verbatim.
        drive_path = u"\\\\?\\C:\\foo"
        unc_path = u"\\\\?\\UNC\\foo\\bar"

        check(quote_local_unicode_path(drive_path),
              pick("'C:\\foo'", "'\\\\?\\C:\\foo'"))
        check(quote_local_unicode_path(drive_path, quotemarks=True),
              pick("'C:\\foo'", "'\\\\?\\C:\\foo'"))
        check(quote_local_unicode_path(drive_path, quotemarks=False),
              pick("C:\\foo", "\\\\?\\C:\\foo"))
        check(quote_local_unicode_path(unc_path),
              pick("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"))
        check(quote_local_unicode_path(unc_path, quotemarks=True),
              pick("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'"))
        check(quote_local_unicode_path(unc_path, quotemarks=False),
              pick("\\\\foo\\bar", "\\\\?\\UNC\\foo\\bar"))
437
438
class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
    """Recorded fixture: Linux host (see ``uname``) with UTF-8 for both I/O and filenames."""
    platform = 'linux2'
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    io_encoding = 'UTF-8'
    filesystem_encoding = 'UTF-8'
    # Byte strings, as a non-Unicode platform reports them.
    argv = 'lumi\xc3\xa8re'
    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']

446
class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
    """Recorded fixture: Linux host (see ``uname``) with ISO-8859-1 for both I/O and filenames."""
    platform = 'linux2'
    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
    io_encoding = 'ISO-8859-1'
    filesystem_encoding = 'ISO-8859-1'
    # Byte strings: one byte per accented character in this encoding.
    argv = 'lumi\xe8re'
    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']

454
class Windows(EncodingUtil, unittest.TestCase):
    """Recorded fixture: win32 host (see ``uname``) with 'mbcs' filenames and UTF-8 I/O."""
    platform = 'win32'
    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
    io_encoding = 'utf-8'
    filesystem_encoding = 'mbcs'
    argv = 'lumi\xc3\xa8re'
    # Unicode strings here, unlike the byte-string Linux listings.
    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']

462
class MacOSXLeopard(EncodingUtil, unittest.TestCase):
    """Recorded fixture: darwin host (see ``uname``) with UTF-8 I/O and decomposed (NFD) filenames.

    NOTE(review): this fixture defines ``output`` rather than ``argv``, so
    the argv-based tests in EncodingUtil return early for it -- confirm
    whether ``argv`` was intended.
    """
    platform = 'darwin'
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    io_encoding = 'UTF-8'
    filesystem_encoding = 'utf-8'
    output = 'lumi\xc3\xa8re'
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']

470
class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
    """Recorded fixture: darwin host (see ``uname``) with US-ASCII I/O and decomposed (NFD) filenames.

    No ``argv`` is recorded, so the argv-based tests in EncodingUtil return early.
    """
    platform = 'darwin'
    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
    io_encoding = 'US-ASCII'
    filesystem_encoding = 'utf-8'
    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']

477
class OpenBSD(EncodingUtil, unittest.TestCase):
    """Recorded fixture: OpenBSD host (see ``uname``) using the '646' codec for I/O and filenames.

    Neither ``argv`` nor ``dirlist`` is recorded, so the tests in
    EncodingUtil that need them return early.
    """
    platform = 'openbsd4'
    uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
    io_encoding = '646'
    filesystem_encoding = '646'
    # Oops, I cannot write filenames containing non-ascii characters
    # (the line above is what the fixture generator printed in place of a dirlist)
484
485
class TestToFromStr(ReallyEqualMixin, unittest.TestCase):
    """Input/output tables for the to_str and from_utf8_or_none helpers."""

    def test_to_str(self):
        # (argument, expected result) pairs, checked in order.
        cases = [
            ("foo", "foo"),
            ("lumi\xc3\xa8re", "lumi\xc3\xa8re"),
            ("\xFF", "\xFF"),  # passes through invalid UTF-8 -- is this what we want?
            (u"lumi\u00E8re", "lumi\xc3\xa8re"),
            (None, None),
        ]
        for given, wanted in cases:
            self.failUnlessReallyEqual(to_str(given), wanted)

    def test_from_utf8_or_none(self):
        check = self.failUnlessReallyEqual
        must_raise = self.failUnlessRaises

        # A unicode argument is rejected with an AssertionError.
        must_raise(AssertionError, from_utf8_or_none, u"foo")
        # Valid UTF-8 decodes; None passes straight through.
        check(from_utf8_or_none("lumi\xc3\xa8re"), u"lumi\u00E8re")
        check(from_utf8_or_none(None), None)
        # Invalid UTF-8 raises instead of passing through.
        must_raise(UnicodeDecodeError, from_utf8_or_none, "\xFF")