From: Francois Deppierraz
Date: Thu, 20 May 2010 00:41:05 +0000 (-0700)
Subject: stringutils.py: Unicode helper functions + associated tests
X-Git-Url: https://git.rkrishnan.org/simplejson/components/%22file:/?a=commitdiff_plain;h=d0ed14e1bb69ec214a8af5c5376d67562c9995e2;p=tahoe-lafs%2Ftahoe-lafs.git

stringutils.py: Unicode helper functions + associated tests

This file contains a bunch of helper functions which converts
unicode string from and to argv, filenames and stdout.
---

diff --git a/src/allmydata/test/test_stringutils.py b/src/allmydata/test/test_stringutils.py
new file mode 100644
index 00000000..e2dba0c5
--- /dev/null
+++ b/src/allmydata/test/test_stringutils.py
@@ -0,0 +1,290 @@
+# coding=utf-8
+
+# File names (one of them non-ASCII) created on disk by the generator below
+# and looked up by StringUtils.test_listdir_unicode.
+TEST_FILENAMES = (
+    u'Ärtonwall.mp3',
+    u'test_file',
+    u'Blah blah.txt',
+)
+
+# The following main helps to generate a test class for other operating
+# systems.
+
+if __name__ == "__main__":
+    import sys, os
+    import tempfile
+    import shutil
+    import platform
+
+    if len(sys.argv) != 2:
+        print "Usage: %s lumière" % sys.argv[0]
+        sys.exit(1)
+
+    print
+    print "class MyWeirdOS(StringUtils, unittest.TestCase):"
+    print "    uname = '%s'" % ' '.join(platform.uname())
+    print "    argv = %s" % repr(sys.argv[1])
+    print "    platform = '%s'" % sys.platform
+    print "    filesystemencoding = '%s'" % sys.getfilesystemencoding()
+    print "    stdoutencoding = '%s'" % sys.stdout.encoding
+
+    # Create the scratch directory outside the try blocks so that the cleanup
+    # below always has a valid tmpdir to remove.
+    tmpdir = tempfile.mkdtemp()
+    try:
+        try:
+            for fname in TEST_FILENAMES:
+                open(os.path.join(tmpdir, fname), 'w').close()
+
+            # Use Unicode API under Windows or MacOS X
+            if sys.platform in ('win32', 'darwin'):
+                dirlist = os.listdir(unicode(tmpdir))
+            else:
+                dirlist = os.listdir(tmpdir)
+
+            print "    dirlist = %s" % repr(dirlist)
+        except (EnvironmentError, UnicodeError):
+            # Only filesystem and encoding failures mean "this platform cannot
+            # store these names"; anything else should propagate.
+            print "    # Oops, I cannot write filenames containing non-ascii characters"
+        print
+    finally:
+        shutil.rmtree(tmpdir)
+
+    sys.exit(0)
+
+from twisted.trial import unittest
+from mock import patch
+import sys
+
+from allmydata.util.stringutils import argv_to_unicode, unicode_to_url, \
+    unicode_to_stdout, unicode_platform, listdir_unicode, open_unicode, \
+    FilenameEncodingError, get_term_encoding
+from twisted.python import usage
+
+# Error-path tests: each case drives a helper with input it cannot handle
+# cleanly and checks the fallback or the exception.
+class StringUtilsErrors(unittest.TestCase):
+    @patch('sys.stdout')
+    def test_get_term_encoding(self, mock):
+        # A stdout without a declared encoding must fall back to ASCII.
+        mock.encoding = None
+
+        self.failUnlessEqual(get_term_encoding(), 'ascii')
+
+    @patch('sys.stdout')
+    def test_argv_to_unicode(self, mock):
+        # latin1 bytes are not valid UTF-8, so decoding must be reported as a
+        # usage error.
+        mock.encoding = 'utf-8'
+
+        self.failUnlessRaises(usage.UsageError,
+                              argv_to_unicode,
+                              u'lumière'.encode('latin1'))
+
+    def test_unicode_to_url(self):
+        # Placeholder: no error case is exercised for unicode_to_url().
+        pass
+
+    @patch('sys.stdout')
+    def test_unicode_to_stdout(self, mock):
+        # Encoding koi8-r cannot represent 'è'
+        mock.encoding = 'koi8-r'
+        self.failUnlessEqual(unicode_to_stdout(u'lumière'), 'lumi?re')
+
+    @patch('os.listdir')
+    def test_unicode_normalization(self, mock):
+        # Pretend to run on an Unicode platform such as Windows
+        orig_platform = sys.platform
+        sys.platform = 'win32'
+
+        # Restore sys.platform even when the assertion fails, so that a failure
+        # here cannot leak the fake platform into later tests.
+        try:
+            # Decomposed 'A' + combining diaeresis must come back precomposed.
+            mock.return_value = [u'A\u0308rtonwall.mp3']
+            self.failUnlessEqual(listdir_unicode(u'/dummy'),
+                                 [u'\xc4rtonwall.mp3'])
+        finally:
+            sys.platform = orig_platform
+
+# The following tests apply only to platforms which don't store filenames as
+# Unicode entities on the filesystem.
+class StringUtilsNonUnicodePlatform(unittest.TestCase):
+    def setUp(self):
+        # Mock sys.platform because unicode_platform() uses it
+        self.original_platform = sys.platform
+        sys.platform = 'linux'
+
+    def tearDown(self):
+        sys.platform = self.original_platform
+
+    @patch('sys.getfilesystemencoding')
+    @patch('os.listdir')
+    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
+        # What happens if a latin1-encoded filename is encountered on a UTF-8
+        # filesystem?
+        mock_listdir.return_value = [
+            u'lumière'.encode('utf-8'),
+            u'lumière'.encode('latin1')]
+
+        mock_getfilesystemencoding.return_value = 'utf-8'
+
+        self.failUnlessRaises(FilenameEncodingError,
+                              listdir_unicode,
+                              u'/dummy')
+
+        # We're trying to list a directory whose name cannot be represented in
+        # the filesystem encoding. This should fail.
+        mock_getfilesystemencoding.return_value = 'ascii'
+        self.failUnlessRaises(FilenameEncodingError,
+                              listdir_unicode,
+                              u'/lumière')
+
+    @patch('sys.getfilesystemencoding')
+    def test_open_unicode(self, mock):
+        # A non-ASCII name cannot be encoded for an ASCII filesystem.
+        mock.return_value = 'ascii'
+
+        self.failUnlessRaises(FilenameEncodingError,
+                              open_unicode,
+                              u'lumière')
+
+# Mixin holding the per-platform test cases. Concrete subclasses below supply
+# the class attributes read here: platform, stdoutencoding,
+# filesystemencoding, dirlist and (optionally) argv.
+class StringUtils():
+    def setUp(self):
+        # Mock sys.platform because unicode_platform() uses it
+        self.original_platform = sys.platform
+        sys.platform = self.platform
+
+    def tearDown(self):
+        sys.platform = self.original_platform
+
+    @patch('sys.stdout')
+    def test_argv_to_unicode(self, mock):
+        if 'argv' not in dir(self):
+            raise unittest.SkipTest("There's no way to pass non-ASCII arguments in CLI on this (mocked) platform")
+
+        mock.encoding = self.stdoutencoding
+
+        argu = u'lumière'
+        argv = self.argv
+
+        self.failUnlessEqual(argv_to_unicode(argv), argu)
+
+    def test_unicode_to_url(self):
+        # failUnlessEqual, not failUnless: the latter would treat the expected
+        # value as a failure message and pass for any non-empty result.
+        self.failUnlessEqual(unicode_to_url(u'lumière'),
+                             u'lumière'.encode('utf-8'))
+
+    @patch('sys.stdout')
+    def test_unicode_to_stdout(self, mock):
+        if 'argv' not in dir(self):
+            raise unittest.SkipTest("There's no way to pass non-ASCII arguments in CLI on this (mocked) platform")
+
+        mock.encoding = self.stdoutencoding
+        self.failUnlessEqual(unicode_to_stdout(u'lumière'), self.argv)
+
+    def test_unicode_platform(self):
+        matrix = {
+            'linux2': False,
+            'win32': True,
+            'darwin': True,
+        }
+
+        self.failUnlessEqual(unicode_platform(), matrix[self.platform])
+
+    @patch('sys.getfilesystemencoding')
+    @patch('os.listdir')
+    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
+
+        mock_listdir.return_value = self.dirlist
+        mock_getfilesystemencoding.return_value = self.filesystemencoding
+
+        filenames = listdir_unicode(u'/dummy')
+
+        # Check the type of what listdir_unicode() returned, not of the
+        # TEST_FILENAMES literals (which are unicode by construction).
+        for fname in filenames:
+            self.failUnless(isinstance(fname, unicode))
+
+        for fname in TEST_FILENAMES:
+            if fname not in filenames:
+                self.fail("Cannot find %r in %r" % (fname, filenames))
+
+    @patch('os.open')
+    def test_open_unicode(self, mock):
+
+        # NOTE(review): the patched os.open is never configured or checked;
+        # the IOError presumably comes from the path not existing on disk.
+        self.failUnlessRaises(IOError,
+                              open_unicode,
+                              u'/dummy_directory/lumière.txt')
+
+
+class UbuntuKarmicUTF8(StringUtils, unittest.TestCase):
+    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
+    argv = 'lumi\xc3\xa8re'
+    platform = 'linux2'
+    filesystemencoding = 'UTF-8'
+    stdoutencoding = 'UTF-8'
+    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
+
+
+class UbuntuKarmicLatin1(StringUtils, unittest.TestCase):
+    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
+    argv = 'lumi\xe8re'
+    platform = 'linux2'
+    filesystemencoding = 'ISO-8859-1'
+    stdoutencoding = 'ISO-8859-1'
+    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
+
+class WindowsXP(StringUtils, unittest.TestCase):
+    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
+    argv = 'lumi\xe8re'
+    platform = 'win32'
+    filesystemencoding = 'mbcs'
+    stdoutencoding = 'cp850'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
+
+    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
+
+class WindowsXP_UTF8(StringUtils, unittest.TestCase):
+    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
+    argv = 'lumi\xe8re'
+    platform = 'win32'
+    filesystemencoding = 'mbcs'
+    stdoutencoding = 'cp65001'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
+
+    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
+
+class WindowsVista(StringUtils, unittest.TestCase):
+    uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel'
+    argv = 'lumi\xe8re'
+    platform = 'win32'
+    filesystemencoding = 'mbcs'
+    stdoutencoding = 'cp850'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
+
+    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
+
+class MacOSXLeopard(StringUtils, unittest.TestCase):
+    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
+    argv = 'lumi\xc3\xa8re'
+    platform = 'darwin'
+    filesystemencoding = 'utf-8'
+    stdoutencoding = 'UTF-8'
+    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
+
+class MacOSXLeopard7bit(StringUtils, unittest.TestCase):
+    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
+    #argv = 'lumiere'
+    platform = 'darwin'
+    filesystemencoding = 'utf-8'
+    stdoutencoding = 'US-ASCII'
+    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
diff --git a/src/allmydata/util/stringutils.py b/src/allmydata/util/stringutils.py
new file mode 100644
index 00000000..96478561
--- /dev/null
+++ b/src/allmydata/util/stringutils.py
@@ -0,0 +1,156 @@
+"""
+Functions used to convert inputs from whatever encoding used in the system to
+unicode and back.
+"""
+
+import sys
+import os
+import unicodedata
+from allmydata.util.assertutil import precondition
+from twisted.python import usage
+
+def get_term_encoding():
+    """
+    Returns expected encoding for writing to the terminal and reading
+    arguments from the command-line.
+
+    Falls back to 'ascii' when sys.stdout.encoding is None.
+    """
+
+    # None is a singleton: test identity rather than equality.
+    if sys.stdout.encoding is None:
+        return 'ascii'
+    else:
+        return sys.stdout.encoding
+
+def argv_to_unicode(s):
+    """
+    Decode given argv element to unicode.
+
+    s must be a byte string. usage.UsageError is raised when s cannot be
+    decoded with the encoding returned by get_term_encoding().
+    """
+
+    precondition(isinstance(s, str), s)
+
+    # Look the encoding up once so that the decode attempt and the error
+    # message are guaranteed to refer to the same encoding.
+    encoding = get_term_encoding()
+    try:
+        return unicode(s, encoding)
+    except UnicodeDecodeError:
+        raise usage.UsageError("Argument '%s' cannot be decoded as %s." %
+                               (s, encoding))
+
+def unicode_to_url(s):
+    """
+    Encode an unicode object used in an URL.
+
+    Returns the UTF-8 encoded byte string; percent-escaping is not done here.
+    """
+    # According to RFC 2718, non-ascii characters in url's must be UTF-8 encoded.
+
+    precondition(isinstance(s, unicode), s)
+    return s.encode('utf-8')
+
+def unicode_to_stdout(s):
+    """
+    Encode an unicode object for representation on stdout.
+
+    Characters which the terminal encoding cannot represent are replaced
+    (the 'replace' error handler) instead of raising an exception.
+    """
+
+    precondition(isinstance(s, unicode), s)
+    return s.encode(get_term_encoding(), 'replace')
+
+def unicode_platform():
+    """
+    Does the current platform handle Unicode filenames natively ?
+
+    True when sys.platform is 'win32' or 'darwin', False otherwise.
+    """
+
+    return sys.platform in ('win32', 'darwin')
+
+class FilenameEncodingError(Exception):
+    """
+    Filename cannot be encoded using the current encoding of your filesystem
+    (%s). Please configure your locale correctly or rename this file.
+    """
+    # NOTE(review): nothing in this module interpolates the '%s' above; the
+    # exception is raised with the offending path or filename as its argument.
+
+    pass
+
+def listdir_unicode_unix(path):
+    """
+    This function emulates an Unicode API under Unix similar to one available
+    under Windows or MacOS X.
+
+    If badly encoded filenames are encountered, an exception is raised.
+    """
+    precondition(isinstance(path, unicode), path)
+
+    encoding = sys.getfilesystemencoding()
+    try:
+        byte_path = path.encode(encoding)
+    except UnicodeEncodeError:
+        raise FilenameEncodingError(path)
+
+    # Decode entry by entry in an explicit loop so that the name passed to the
+    # exception is the one that failed, without relying on Python 2 leaking
+    # the list-comprehension variable into the enclosing scope.
+    filenames = []
+    for fn in os.listdir(byte_path):
+        try:
+            filenames.append(unicode(fn, encoding))
+        except UnicodeDecodeError:
+            raise FilenameEncodingError(fn)
+    return filenames
+
+def listdir_unicode(path, encoding = None):
+    """
+    Wrapper around listdir() which provides safe access to the convenient
+    Unicode API even under Unix.
+
+    The encoding argument is currently unused.
+    """
+
+    precondition(isinstance(path, unicode), path)
+
+    # On Windows and MacOS X, the Unicode API is used
+    if unicode_platform():
+        dirlist = os.listdir(path)
+
+    # On other platforms (ie. Unix systems), the byte-level API is used
+    else:
+        dirlist = listdir_unicode_unix(path)
+
+    # Normalize the resulting unicode filenames
+    #
+    # This prevents different OS from generating non-equal unicode strings for
+    # the same filename representation
+    return [unicodedata.normalize('NFC', fname) for fname in dirlist]
+
+def open_unicode(path, mode='r'):
+    """
+    Wrapper around open() which provides safe access to the convenient Unicode
+    API even under Unix.
+
+    FilenameEncodingError is raised when path cannot be encoded with the
+    filesystem encoding on a platform without native Unicode filenames.
+    """
+
+    precondition(isinstance(path, unicode), path)
+
+    if unicode_platform():
+        return open(path, mode)
+
+    # Keep only the encoding step inside the try block; errors raised by
+    # open() itself propagate unchanged.
+    try:
+        byte_path = path.encode(sys.getfilesystemencoding())
+    except UnicodeEncodeError:
+        raise FilenameEncodingError(path)
+    return open(byte_path, mode)