--- /dev/null
+# coding=utf-8
+
+# File names used by the directory-listing tests and by the generator script
+# below: one with a non-ASCII character, one without an extension and one
+# containing a space.
+TEST_FILENAMES = (
+    u'Ärtonwall.mp3',
+    u'test_file',
+    u'Blah blah.txt',
+)
+
+# The following main block helps to generate a test class for other operating
+# systems: run this file as a script with the single argument "lumière" and
+# paste the printed class definition next to the fixture classes at the end of
+# this module.
+
+if __name__ == "__main__":
+    import os
+    import platform
+    import shutil
+    import sys
+    import tempfile
+
+    if len(sys.argv) != 2:
+        print "Usage: %s lumière" % sys.argv[0]
+        sys.exit(1)
+
+    print
+    print "class MyWeirdOS(StringUtils, unittest.TestCase):"
+    print " uname = '%s'" % ' '.join(platform.uname())
+    print " argv = %s" % repr(sys.argv[1])
+    print " platform = '%s'" % sys.platform
+    print " filesystemencoding = '%s'" % sys.getfilesystemencoding()
+    print " stdoutencoding = '%s'" % sys.stdout.encoding
+
+    # Create the scratch directory before entering the try block so that the
+    # cleanup in the finally clause always has a valid path to remove.
+    tmpdir = tempfile.mkdtemp()
+    try:
+        try:
+            for fname in TEST_FILENAMES:
+                open(os.path.join(tmpdir, fname), 'w').close()
+
+            # Use Unicode API under Windows or MacOS X
+            if sys.platform in ('win32', 'darwin'):
+                dirlist = os.listdir(unicode(tmpdir))
+            else:
+                dirlist = os.listdir(tmpdir)
+
+            print " dirlist = %s" % repr(dirlist)
+        except (UnicodeError, EnvironmentError):
+            # Only encoding and filesystem failures mean "this platform cannot
+            # store these names"; anything else (including Ctrl-C) propagates.
+            print " # Oops, I cannot write filenames containing non-ascii characters"
+        print
+    finally:
+        # Remove the scratch directory whether or not the listing succeeded.
+        shutil.rmtree(tmpdir)
+
+    sys.exit(0)
+
+from twisted.trial import unittest
+from mock import patch
+import sys
+
+from allmydata.util.stringutils import argv_to_unicode, unicode_to_url, \
+ unicode_to_stdout, unicode_platform, listdir_unicode, open_unicode, \
+ FilenameEncodingError, get_term_encoding
+from twisted.python import usage
+
+class StringUtilsErrors(unittest.TestCase):
+    # Error paths and fallbacks of the stringutils helpers, exercised with a
+    # mocked sys.stdout / os.listdir so the results do not depend on the host.
+
+    @patch('sys.stdout')
+    def test_get_term_encoding(self, mock):
+        # A stdout that reports no encoding must fall back to ASCII.
+        mock.encoding = None
+
+        self.failUnlessEqual(get_term_encoding(), 'ascii')
+
+    @patch('sys.stdout')
+    def test_argv_to_unicode(self, mock):
+        # Latin-1 bytes for 'è' are not valid UTF-8, so decoding is rejected.
+        mock.encoding = 'utf-8'
+
+        self.failUnlessRaises(usage.UsageError,
+                              argv_to_unicode,
+                              u'lumière'.encode('latin1'))
+
+    def test_unicode_to_url(self):
+        # Placeholder: no error case is exercised for unicode_to_url() yet.
+        pass
+
+    @patch('sys.stdout')
+    def test_unicode_to_stdout(self, mock):
+        # Encoding koi8-r cannot represent 'è'
+        mock.encoding = 'koi8-r'
+        self.failUnlessEqual(unicode_to_stdout(u'lumière'), 'lumi?re')
+
+    @patch('os.listdir')
+    def test_unicode_normalization(self, mock):
+        # Pretend to run on a Unicode platform such as Windows
+        orig_platform = sys.platform
+        sys.platform = 'win32'
+        try:
+            # A decomposed name (A + combining diaeresis) must come back in
+            # its composed (NFC) form.
+            mock.return_value = [u'A\u0308rtonwall.mp3']
+            self.failUnlessEqual(listdir_unicode(u'/dummy'), [u'\xc4rtonwall.mp3'])
+        finally:
+            # Restore the real platform even when the assertion fails, so a
+            # failure here cannot leak 'win32' into later tests.
+            sys.platform = orig_platform
+
+# The following tests apply only to platforms which don't store filenames as
+# Unicode entities on the filesystem.
+class StringUtilsNonUnicodePlatform(unittest.TestCase):
+    def setUp(self):
+        # unicode_platform() looks at sys.platform, so force a Unix-like value
+        # for the duration of each test.
+        self.original_platform = sys.platform
+        sys.platform = 'linux'
+
+    def tearDown(self):
+        sys.platform = self.original_platform
+
+    @patch('sys.getfilesystemencoding')
+    @patch('os.listdir')
+    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
+        # What happens when a latin1-encoded filename is encountered on a
+        # UTF-8 filesystem?
+        utf8_name = u'lumière'.encode('utf-8')
+        latin1_name = u'lumière'.encode('latin1')
+        mock_listdir.return_value = [utf8_name, latin1_name]
+        mock_getfilesystemencoding.return_value = 'utf-8'
+
+        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/dummy')
+
+        # Listing a directory whose own name cannot be represented in the
+        # filesystem encoding should fail as well.
+        mock_getfilesystemencoding.return_value = 'ascii'
+        self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/lumière')
+
+    @patch('sys.getfilesystemencoding')
+    def test_open_unicode(self, mock):
+        # An ASCII filesystem encoding cannot represent 'è' in the file name.
+        mock.return_value = 'ascii'
+        self.failUnlessRaises(FilenameEncodingError, open_unicode, u'lumière')
+
+class StringUtils():
+    # Mixin holding the platform-independent test bodies. Concrete subclasses
+    # combine it with unittest.TestCase and supply the class attributes
+    # `platform`, `filesystemencoding`, `stdoutencoding`, `dirlist` and,
+    # optionally, `argv`.
+
+    def setUp(self):
+        # Mock sys.platform because unicode_platform() uses it
+        self.original_platform = sys.platform
+        sys.platform = self.platform
+
+    def tearDown(self):
+        sys.platform = self.original_platform
+
+    @patch('sys.stdout')
+    def test_argv_to_unicode(self, mock):
+        if not hasattr(self, 'argv'):
+            raise unittest.SkipTest("There's no way to pass non-ASCII arguments in CLI on this (mocked) platform")
+
+        mock.encoding = self.stdoutencoding
+
+        argu = u'lumière'
+        argv = self.argv
+
+        self.failUnlessEqual(argv_to_unicode(argv), argu)
+
+    def test_unicode_to_url(self):
+        # failUnlessEqual, not failUnless: the latter would take the second
+        # argument as the failure message and never compare the values.
+        self.failUnlessEqual(unicode_to_url(u'lumière'), u'lumière'.encode('utf-8'))
+
+    @patch('sys.stdout')
+    def test_unicode_to_stdout(self, mock):
+        if not hasattr(self, 'argv'):
+            raise unittest.SkipTest("There's no way to pass non-ASCII arguments in CLI on this (mocked) platform")
+
+        mock.encoding = self.stdoutencoding
+        self.failUnlessEqual(unicode_to_stdout(u'lumière'), self.argv)
+
+    def test_unicode_platform(self):
+        # Expected answer of unicode_platform() for each mocked sys.platform.
+        matrix = {
+            'linux2': False,
+            'win32': True,
+            'darwin': True,
+        }
+
+        self.failUnlessEqual(unicode_platform(), matrix[self.platform])
+
+    @patch('sys.getfilesystemencoding')
+    @patch('os.listdir')
+    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
+
+        mock_listdir.return_value = self.dirlist
+        mock_getfilesystemencoding.return_value = self.filesystemencoding
+
+        filenames = listdir_unicode(u'/dummy')
+
+        # Check the type of what listdir_unicode() returned, not of the
+        # constants we compare against.
+        for fname in filenames:
+            self.failUnless(isinstance(fname, unicode))
+
+        for fname in TEST_FILENAMES:
+            if fname not in filenames:
+                self.fail("Cannot find %r in %r" % (fname, filenames))
+
+    @patch('os.open')
+    def test_open_unicode(self, mock):
+        # NOTE(review): open_unicode() calls the builtin open(), not os.open,
+        # so this patch does not appear to intercept anything; the IOError
+        # presumably comes from the directory not existing -- confirm.
+        self.failUnlessRaises(IOError,
+                              open_unicode,
+                              u'/dummy_directory/lumière.txt')
+
+
+class UbuntuKarmicUTF8(StringUtils, unittest.TestCase):
+    # Fixture for a Linux host using UTF-8 for both the filesystem and the
+    # terminal; argv and dirlist are UTF-8 encoded byte strings.
+    platform = 'linux2'
+    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
+    filesystemencoding = 'UTF-8'
+    stdoutencoding = 'UTF-8'
+    argv = 'lumi\xc3\xa8re'
+    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
+
+
+class UbuntuKarmicLatin1(StringUtils, unittest.TestCase):
+    # Fixture for a Linux host using ISO-8859-1 for both the filesystem and
+    # the terminal; argv and dirlist are Latin-1 encoded byte strings.
+    platform = 'linux2'
+    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
+    filesystemencoding = 'ISO-8859-1'
+    stdoutencoding = 'ISO-8859-1'
+    argv = 'lumi\xe8re'
+    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
+
+class WindowsXP(StringUtils, unittest.TestCase):
+    # Fixture for Windows XP with a cp850 console; dirlist already holds
+    # unicode objects.
+    platform = 'win32'
+    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
+    filesystemencoding = 'mbcs'
+    stdoutencoding = 'cp850'
+    argv = 'lumi\xe8re'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
+
+    # Reason string for the not-yet-working command-line handling.
+    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
+
+class WindowsXP_UTF8(StringUtils, unittest.TestCase):
+    # Fixture for Windows XP with the console switched to code page 65001;
+    # dirlist already holds unicode objects.
+    platform = 'win32'
+    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
+    filesystemencoding = 'mbcs'
+    stdoutencoding = 'cp65001'
+    argv = 'lumi\xe8re'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
+
+    # Reason string for the not-yet-working command-line handling.
+    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
+
+class WindowsVista(StringUtils, unittest.TestCase):
+    # Fixture for Windows Vista with a cp850 console; dirlist already holds
+    # unicode objects.
+    platform = 'win32'
+    uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel'
+    filesystemencoding = 'mbcs'
+    stdoutencoding = 'cp850'
+    argv = 'lumi\xe8re'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
+
+    # Reason string for the not-yet-working command-line handling.
+    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
+
+class MacOSXLeopard(StringUtils, unittest.TestCase):
+    # Fixture for Mac OS X Leopard with a UTF-8 terminal; dirlist holds
+    # unicode objects in decomposed form (A followed by U+0308).
+    platform = 'darwin'
+    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
+    filesystemencoding = 'utf-8'
+    stdoutencoding = 'UTF-8'
+    argv = 'lumi\xc3\xa8re'
+    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
+
+class MacOSXLeopard7bit(StringUtils, unittest.TestCase):
+    # Fixture for Mac OS X Leopard with a US-ASCII terminal. No `argv`
+    # attribute is defined on purpose, so the argv-dependent tests inherited
+    # from StringUtils raise SkipTest.
+    platform = 'darwin'
+    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
+    filesystemencoding = 'utf-8'
+    stdoutencoding = 'US-ASCII'
+    #argv = 'lumiere'
+    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
--- /dev/null
+"""
+Functions used to convert inputs from whatever encoding is used by the system
+to unicode and back.
+"""
+
+import sys
+import os
+import unicodedata
+from allmydata.util.assertutil import precondition
+from twisted.python import usage
+
+def get_term_encoding():
+    """
+    Returns expected encoding for writing to the terminal and reading
+    arguments from the command-line.
+
+    This is sys.stdout.encoding, or 'ascii' when stdout reports no encoding
+    (the attribute is None) or has no such attribute at all, as is the case
+    for some file-like replacements of sys.stdout.
+    """
+
+    encoding = getattr(sys.stdout, 'encoding', None)
+    if encoding is None:
+        return 'ascii'
+    return encoding
+
+def argv_to_unicode(s):
+    """
+    Decode a single command-line argument (a byte string) to unicode.
+
+    The terminal encoding reported by get_term_encoding() is used. If the
+    argument is not valid in that encoding, a usage.UsageError carrying an
+    explanatory message is raised.
+    """
+    precondition(isinstance(s, str), s)
+
+    try:
+        decoded = unicode(s, get_term_encoding())
+    except UnicodeDecodeError:
+        message = "Argument '%s' cannot be decoded as %s." % (s, get_term_encoding())
+        raise usage.UsageError(message)
+    return decoded
+
+def unicode_to_url(s):
+    """
+    Encode a unicode object for use in a URL.
+
+    The result is the UTF-8 encoded byte string; according to RFC 2718,
+    non-ASCII characters in URLs must be UTF-8 encoded.
+    """
+    precondition(isinstance(s, unicode), s)
+
+    utf8_bytes = s.encode('utf-8')
+    return utf8_bytes
+
+def unicode_to_stdout(s):
+    """
+    Encode a unicode object for display on stdout.
+
+    The terminal encoding from get_term_encoding() is used; characters it
+    cannot represent are substituted (the 'replace' error handler) instead of
+    raising an error.
+    """
+    precondition(isinstance(s, unicode), s)
+
+    terminal_encoding = get_term_encoding()
+    return s.encode(terminal_encoding, 'replace')
+
+def unicode_platform():
+    """
+    Tell whether the current platform handles Unicode filenames natively.
+
+    True when sys.platform is 'win32' or 'darwin', False otherwise.
+    """
+    native_unicode_platforms = ('win32', 'darwin')
+    return sys.platform in native_unicode_platforms
+
+# Raised by listdir_unicode_unix() and open_unicode() in this module when a
+# name cannot be converted between unicode and the filesystem encoding; the
+# offending path or file name is passed as the exception argument.
+# NOTE(review): the docstring contains a "%s" placeholder that nothing in this
+# module fills in; presumably callers format it with the filesystem encoding --
+# confirm before rewording it.
+class FilenameEncodingError(Exception):
+ """
+ Filename cannot be encoded using the current encoding of your filesystem
+ (%s). Please configure your locale correctly or rename this file.
+ """
+
+ pass
+
+def listdir_unicode_unix(path):
+    """
+    This function emulates a Unicode API under Unix similar to the one
+    available under Windows or MacOS X.
+
+    `path` must be a unicode object. It is encoded with the filesystem
+    encoding, the directory is listed through the byte-level API, and every
+    entry is decoded back to unicode.
+
+    Raises FilenameEncodingError, carrying the offending name, if `path`
+    cannot be encoded or if a badly encoded filename is encountered.
+    """
+    precondition(isinstance(path, unicode), path)
+
+    encoding = sys.getfilesystemencoding()
+    try:
+        byte_path = path.encode(encoding)
+    except UnicodeEncodeError:
+        raise FilenameEncodingError(path)
+
+    # Decode entry by entry so the failing name is known explicitly instead
+    # of relying on a list-comprehension variable leaking into this scope.
+    filenames = []
+    for fn in os.listdir(byte_path):
+        try:
+            filenames.append(unicode(fn, encoding))
+        except UnicodeDecodeError:
+            raise FilenameEncodingError(fn)
+    return filenames
+
+def listdir_unicode(path, encoding = None):
+    """
+    List a directory and return its entries as NFC-normalized unicode names,
+    giving safe access to the convenient Unicode API even under Unix.
+
+    `path` must be a unicode object. The `encoding` argument is accepted but
+    not used by this function.
+    """
+
+    precondition(isinstance(path, unicode), path)
+
+    if unicode_platform():
+        # Windows and MacOS X: the native Unicode API is used
+        entries = os.listdir(path)
+    else:
+        # Other platforms (ie. Unix systems): go through the byte-level API
+        entries = listdir_unicode_unix(path)
+
+    # Normalizing keeps different operating systems from producing non-equal
+    # unicode strings for the same filename representation.
+    normalized = []
+    for entry in entries:
+        normalized.append(unicodedata.normalize('NFC', entry))
+    return normalized
+
+def open_unicode(path, mode='r'):
+    """
+    Open the file named by the unicode object `path`, giving safe access to
+    the convenient Unicode API even under Unix.
+
+    On platforms with native Unicode filenames the path is handed to open()
+    unchanged; elsewhere it is first encoded with the filesystem encoding.
+    Raises FilenameEncodingError if that encoding cannot represent `path`.
+    """
+
+    precondition(isinstance(path, unicode), path)
+
+    if unicode_platform():
+        return open(path, mode)
+
+    fs_encoding = sys.getfilesystemencoding()
+    try:
+        return open(path.encode(fs_encoding), mode)
+    except UnicodeEncodeError:
+        raise FilenameEncodingError(path)