Correct stringutils->encodingutil patch to be the newer version, rather than the...
author david-sarah <david-sarah@jacaranda.org>
Sun, 18 Jul 2010 01:34:35 +0000 (18:34 -0700)
committer david-sarah <david-sarah@jacaranda.org>
Sun, 18 Jul 2010 01:34:35 +0000 (18:34 -0700)
src/allmydata/scripts/tahoe_backup.py
src/allmydata/scripts/tahoe_cp.py
src/allmydata/scripts/tahoe_get.py
src/allmydata/scripts/tahoe_put.py
src/allmydata/test/test_backupdb.py
src/allmydata/test/test_cli.py
src/allmydata/test/test_encodingutil.py
src/allmydata/util/encodingutil.py
src/allmydata/util/fileutil.py

index 2525b2fa1dfed38b8ed4a02a99d6f202524d4df2..b71555362765855869b0f4f3a22535bf2dfe9675 100644 (file)
@@ -9,8 +9,7 @@ from allmydata.scripts.common import get_alias, escape_path, DEFAULT_ALIAS, \
 from allmydata.scripts.common_http import do_http, HTTPError, format_http_error
 from allmydata.util import time_format
 from allmydata.scripts import backupdb
-from allmydata.util.encodingutil import quote_output, to_str
-from allmydata.util.fileutil import open_expanduser
+from allmydata.util.encodingutil import listdir_unicode, quote_output, to_str, FilenameEncodingError
 from allmydata.util.assertutil import precondition
 
 
@@ -167,14 +166,14 @@ class BackerUpper:
         compare_contents = {} # childname -> rocap
 
         try:
-            children = os.listdir(localpath)
+            children = listdir_unicode(localpath)
         except EnvironmentError:
             self.directories_skipped += 1
             self.warn("WARNING: permission denied on directory %s" % quote_output(localpath))
             children = []
-        except (UnicodeEncodeError, UnicodeDecodeError):
+        except FilenameEncodingError:
             self.directories_skipped += 1
-            self.warn("WARNING: could not list directory %s due to an encoding error" % quote_output(localpath))
+            self.warn("WARNING: could not list directory %s due to a filename encoding error" % quote_output(localpath))
             children = []
 
         for child in self.options.filter_listdir(children):
@@ -297,7 +296,7 @@ class BackerUpper:
 
         if must_upload:
             self.verboseprint("uploading %s.." % quote_output(childpath))
-            infileobj = open_expanduser(childpath, "rb")
+            infileobj = open(childpath, "rb")
             url = self.options['node-url'] + "uri"
             resp = do_http("PUT", url, infileobj)
             if resp.status not in (200, 201):
index 76fa95ef0a338df02265394b8aa787cc8e984e3f..5ef9de1451d33610a93973feecad4ebd815be14d 100644 (file)
@@ -8,9 +8,8 @@ from allmydata.scripts.common import get_alias, escape_path, \
                                      DefaultAliasMarker, TahoeError
 from allmydata.scripts.common_http import do_http, HTTPError
 from allmydata import uri
-from allmydata.util.encodingutil import unicode_to_url, quote_output, to_str
 from allmydata.util import fileutil
-from allmydata.util.fileutil import open_expanduser, abspath_expanduser
+from allmydata.util.encodingutil import unicode_to_url, listdir_unicode, quote_output, to_str
 from allmydata.util.assertutil import precondition
 
 
@@ -68,7 +67,7 @@ class LocalFileSource:
         return True
 
     def open(self, caps_only):
-        return open_expanduser(self.pathname, "rb")
+        return open(os.path.expanduser(self.pathname), "rb")
 
 
 class LocalFileTarget:
@@ -101,7 +100,7 @@ class LocalDirectorySource:
         if self.children is not None:
             return
         self.children = {}
-        children = os.listdir(self.pathname)
+        children = listdir_unicode(self.pathname)
         for i,n in enumerate(children):
             self.progressfunc("examining %d of %d" % (i, len(children)))
             pn = os.path.join(self.pathname, n)
@@ -129,7 +128,7 @@ class LocalDirectoryTarget:
         if self.children is not None:
             return
         self.children = {}
-        children = os.listdir(self.pathname)
+        children = listdir_unicode(self.pathname)
         for i,n in enumerate(children):
             self.progressfunc("examining %d of %d" % (i, len(children)))
             n = unicode(n)
@@ -512,7 +511,7 @@ class Copier:
         rootcap, path = get_alias(self.aliases, destination_spec, None)
         if rootcap == DefaultAliasMarker:
             # no alias, so this is a local file
-            pathname = abspath_expanduser(path.decode('utf-8'))
+            pathname = os.path.abspath(os.path.expanduser(path.decode('utf-8')))
             if not os.path.exists(pathname):
                 t = LocalMissingTarget(pathname)
             elif os.path.isdir(pathname):
@@ -552,7 +551,7 @@ class Copier:
         rootcap, path = get_alias(self.aliases, source_spec, None)
         if rootcap == DefaultAliasMarker:
             # no alias, so this is a local file
-            pathname = abspath_expanduser(path.decode('utf-8'))
+            pathname = os.path.abspath(os.path.expanduser(path.decode('utf-8')))
             name = os.path.basename(pathname)
             if not os.path.exists(pathname):
                 raise MissingSourceError(source_spec)
index 63032ed48602769879baa9f8194ebc0fa70ed8c1..280d8c052abdd697d06e131c57551e79bbfe9dd6 100644 (file)
@@ -1,9 +1,8 @@
 
-import urllib
+import os, urllib
 from allmydata.scripts.common import get_alias, DEFAULT_ALIAS, escape_path, \
                                      UnknownAliasError
 from allmydata.scripts.common_http import do_http, format_http_error
-from allmydata.util.fileutil import open_expanduser
 
 def get(options):
     nodeurl = options['node-url']
@@ -27,7 +26,7 @@ def get(options):
     resp = do_http("GET", url)
     if resp.status in (200, 201,):
         if to_file:
-            outf = open_expanduser(to_file, "wb")
+            outf = open(os.path.expanduser(to_file), "wb")
         else:
             outf = stdout
         while True:
index d646110ed85de75a8816a8c0a0007e798648cf89..eb578becbbd8ea2ca868c6bc531d9b9587a54cfd 100644 (file)
@@ -1,11 +1,11 @@
 
+import os
 from cStringIO import StringIO
 import urllib
 from allmydata.scripts.common_http import do_http, format_http_success, format_http_error
 from allmydata.scripts.common import get_alias, DEFAULT_ALIAS, escape_path, \
                                      UnknownAliasError
 from allmydata.util.encodingutil import quote_output
-from allmydata.util.fileutil import open_expanduser
 
 def put(options):
     """
@@ -65,7 +65,7 @@ def put(options):
     if mutable:
         url += "?mutable=true"
     if from_file:
-        infileobj = open_expanduser(from_file, "rb")
+        infileobj = open(os.path.expanduser(from_file), "rb")
     else:
         # do_http() can't use stdin directly: for one thing, we need a
         # Content-Length field. So we currently must copy it.
index 1d6d97348e8443cc5df8d952fb85731883463fef..6cd4ffa2ee1d91a58086d93ceb763503fef313b8 100644 (file)
@@ -4,7 +4,7 @@ from StringIO import StringIO
 from twisted.trial import unittest
 
 from allmydata.util import fileutil
-from allmydata.util.encodingutil import get_filesystem_encoding, unicode_platform
+from allmydata.util.encodingutil import listdir_unicode, get_filesystem_encoding, unicode_platform
 from allmydata.util.assertutil import precondition
 from allmydata.scripts import backupdb
 
@@ -249,7 +249,7 @@ class BackupDB(unittest.TestCase):
         self.failUnless(bdb)
 
         self.writeto(u"f\u00f6\u00f6.txt", "foo.txt")
-        files = [fn for fn in os.listdir(unicode(basedir)) if fn.endswith(".txt")]
+        files = [fn for fn in listdir_unicode(unicode(basedir)) if fn.endswith(".txt")]
         self.failUnlessEqual(len(files), 1)
         foo_fn = os.path.join(basedir, files[0])
         #print foo_fn, type(foo_fn)
index f52e468e06894ea61ba8d22e66ededb2e5f8266c..6ee4d1fe76f9df60edb9f68a7b9d2c12bb0ebf5a 100644 (file)
@@ -31,8 +31,8 @@ from twisted.internet import threads # CLI tests use deferToThread
 from twisted.python import usage
 
 from allmydata.util.assertutil import precondition
-from allmydata.util.encodingutil import unicode_platform, quote_output, \
-    get_output_encoding, get_argv_encoding, get_filesystem_encoding, \
+from allmydata.util.encodingutil import listdir_unicode, unicode_platform, \
+    quote_output, get_output_encoding, get_argv_encoding, get_filesystem_encoding, \
     unicode_to_output, to_str
 
 timeout = 480 # deep_check takes 360s on Zandr's linksys box, others take > 240s
@@ -441,7 +441,7 @@ class CLI(CLITestMixin, unittest.TestCase):
         for name in filenames:
             open(os.path.join(unicode(basedir), name), "wb").close()
 
-        for file in os.listdir(unicode(basedir)):
+        for file in listdir_unicode(unicode(basedir)):
             self.failUnlessIn(normalize(file), filenames)
 
 
index 10310ca6b5ef4773bdd27bba7bf570949bc39ffa..9a3c5f33058e370c59d83e39d6e6fa37b999ecc7 100644 (file)
@@ -13,7 +13,9 @@ TEST_FILENAMES = (
 # systems.
 
 if __name__ == "__main__":
-    import sys
+    import sys, os
+    import tempfile
+    import shutil
     import platform
 
     if len(sys.argv) != 2:
@@ -21,7 +23,7 @@ if __name__ == "__main__":
         sys.exit(1)
     
     print
-    print "class MyWeirdOS(StringUtils, unittest.TestCase):"
+    print "class MyWeirdOS(EncodingUtil, unittest.TestCase):"
     print "    uname = '%s'" % ' '.join(platform.uname())
     if sys.platform != "win32":
         print "    argv = %s" % repr(sys.argv[1])
@@ -29,21 +31,39 @@ if __name__ == "__main__":
     print "    filesystem_encoding = '%s'" % sys.getfilesystemencoding()
     print "    output_encoding = '%s'" % sys.stdout.encoding
     print "    argv_encoding = '%s'" % (sys.platform == "win32" and 'ascii' or sys.stdout.encoding)
+
+    try:
+        tmpdir = tempfile.mkdtemp()
+        for fname in TEST_FILENAMES:
+            open(os.path.join(tmpdir, fname), 'w').close() 
+
+        # Use Unicode API under Windows or MacOS X
+        if sys.platform in ('win32', 'darwin'):
+            dirlist = os.listdir(unicode(tmpdir))
+        else:
+            dirlist = os.listdir(tmpdir)
+
+        print "    dirlist = %s" % repr(dirlist)
+    except:
+        print "    # Oops, I cannot write filenames containing non-ascii characters"
     print
 
+    shutil.rmtree(tmpdir)
     sys.exit(0)
 
 from twisted.trial import unittest
 from mock import patch
-import sys, locale
+import os, sys, locale
 
 from allmydata.test.common_util import ReallyEqualMixin
 from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \
-    unicode_to_output, unicode_platform, get_output_encoding, _reload
+    unicode_to_output, unicode_platform, listdir_unicode, FilenameEncodingError, \
+    get_output_encoding, get_filesystem_encoding, _reload
+from allmydata.dirnode import normalize
 
 from twisted.python import usage
 
-class StringUtilsErrors(ReallyEqualMixin, unittest.TestCase):
+class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase):
     def tearDown(self):
         _reload()
 
@@ -103,8 +123,55 @@ class StringUtilsErrors(ReallyEqualMixin, unittest.TestCase):
         _reload()
         self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc)
 
+    @patch('os.listdir')
+    def test_no_unicode_normalization(self, mock):
+        """listdir_unicode() must hand back names exactly as os.listdir() returned them.
+
+        os.listdir is replaced by a mock whose single entry is Artonwall_nfd
+        (presumably a decomposed/NFD spelling -- defined elsewhere in this module),
+        and the result is required to be really-equal to that same entry.
+        """
+        # Pretend to run on a Unicode platform.
+        # We normalized to NFC in 1.7beta, but we now don't.
+        orig_platform = sys.platform
+        try:
+            sys.platform = 'darwin'
+            mock.return_value = [Artonwall_nfd]
+            # Presumably makes encodingutil re-read the (faked) platform -- confirm.
+            _reload()
+            self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd])
+        finally:
+            # Restore the real platform even if the assertion above fails.
+            sys.platform = orig_platform
 
-class StringUtils(ReallyEqualMixin):
+# The following tests apply only to platforms that don't store filenames as
+# Unicode entities on the filesystem.
+class EncodingUtilNonUnicodePlatform(unittest.TestCase):
+    """Checks listdir_unicode() error reporting with sys.platform forced to 'linux'."""
+
+    def setUp(self):
+        # Mock sys.platform because unicode_platform() uses it
+        self.original_platform = sys.platform
+        sys.platform = 'linux'
+
+    def tearDown(self):
+        # Undo the platform override; _reload() presumably refreshes encodingutil's
+        # cached settings so later tests see the real ones -- confirm.
+        sys.platform = self.original_platform
+        _reload()
+
+    @patch('sys.getfilesystemencoding')
+    @patch('os.listdir')
+    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
+        """Undecodable entries and an unencodable path both raise FilenameEncodingError."""
+        # What happens if latin1-encoded filenames are encountered on a UTF-8
+        # filesystem?
+        mock_listdir.return_value = [
+            lumiere_nfc.encode('utf-8'),
+            lumiere_nfc.encode('latin1')]
+
+        mock_getfilesystemencoding.return_value = 'utf-8'
+        _reload()
+        self.failUnlessRaises(FilenameEncodingError,
+                              listdir_unicode,
+                              u'/dummy')
+        
+        # We're trying to list a directory whose name cannot be represented in
+        # the filesystem encoding.  This should fail.
+        mock_getfilesystemencoding.return_value = 'ascii'
+        _reload()
+        self.failUnlessRaises(FilenameEncodingError,
+                              listdir_unicode,
+                              u'/' + lumiere_nfc)
+
+class EncodingUtil(ReallyEqualMixin):
     def setUp(self):
         # Mock sys.platform because unicode_platform() uses it
         self.original_platform = sys.platform
@@ -148,8 +215,74 @@ class StringUtils(ReallyEqualMixin):
         _reload()
         self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform])
  
+    @patch('sys.getfilesystemencoding')
+    @patch('os.listdir')
+    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
+        """Replay a recorded per-platform directory listing through listdir_unicode().
+
+        Subclasses provide `dirlist` and `filesystem_encoding` as class
+        attributes; the test silently does nothing for a subclass without
+        `dirlist`. The normalized result must equal TEST_FILENAMES as a set.
+        """
+        if 'dirlist' not in dir(self):
+            return
+
+        # Probe whether this interpreter knows the recorded encoding at all;
+        # if not, the test is skipped rather than failed.
+        try:
+            u"test".encode(self.filesystem_encoding)
+        except (LookupError, AttributeError):
+            raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding "
+                                    "that we are testing for the benefit of a different platform."
+                                    % (self.filesystem_encoding,))
+
+        mock_listdir.return_value = self.dirlist
+        mock_getfilesystemencoding.return_value = self.filesystem_encoding
+
+        # Must come after the mocks are configured (presumably so encodingutil
+        # picks up the mocked filesystem encoding -- confirm).
+        _reload()
+        filenames = listdir_unicode(u'/dummy')
+
+        # Compared as sets after normalize(), so ordering and the
+        # composed/decomposed form of the recorded names do not matter.
+        self.failUnlessEqual(set([normalize(fname) for fname in filenames]),
+                             set(TEST_FILENAMES))
+
+
+class StdlibUnicode(unittest.TestCase):
+    """This mainly tests that some of the stdlib functions support Unicode paths, but also that
+    listdir_unicode works for valid filenames."""
+
+    def skip_if_cannot_represent_filename(self, u):
+        """Raise SkipTest when, on a non-Unicode platform, u cannot be encoded
+        in the filesystem encoding."""
+        enc = get_filesystem_encoding()
+        if not unicode_platform():
+            try:
+                u.encode(enc)
+            except UnicodeEncodeError:
+                raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.")
+
+    def test_mkdir_open_exists_abspath_listdir_expanduser(self):
+        """Exercise os.mkdir, open, os.path.exists/abspath/expanduser and
+        listdir_unicode with a non-ASCII name on the real filesystem."""
+        self.skip_if_cannot_represent_filename(lumiere_nfc)
+
+        # NOTE(review): the directory is created relative to the current working
+        # directory and nothing visible here removes it afterwards; if it already
+        # exists, os.mkdir fails and the test is reported as skipped -- confirm
+        # that the test runner's scratch directory makes this harmless.
+        try:
+            os.mkdir(lumiere_nfc)
+        except EnvironmentError, e:
+            raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run "
+                                    "does not support Unicode, even though the platform does." % (e,))
+
+        fn = lumiere_nfc + '/' + lumiere_nfc + '.txt'
+        open(fn, 'wb').close()
+        self.failUnless(os.path.exists(fn))
+        self.failUnless(os.path.exists(os.path.abspath(fn)))
+        filenames = listdir_unicode(lumiere_nfc)
+
+        # We only require that the listing includes a filename that is canonically equivalent
+        # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent).
+        self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames]))
+
+        # expanduser must replace the "~" and leave the non-ASCII tail intact.
+        expanded = os.path.expanduser("~/" + lumiere_nfc)
+        self.failIfIn("~", expanded)
+        self.failUnless(expanded.endswith(lumiere_nfc), expanded)
+
+    @patch('sys.getfilesystemencoding')
+    def test_open_unrepresentable(self, mock):
+        """open() on a name the (mocked, ASCII) filesystem encoding cannot
+        represent must raise UnicodeEncodeError on non-Unicode platforms."""
+        if unicode_platform():
+            raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.")
+
+        # NOTE(review): sys.getfilesystemencoding is patched but _reload() is not
+        # called here; presumably open() consults it directly -- confirm.
+        mock.return_value = 'ascii'
+        self.failUnlessRaises(UnicodeEncodeError, open, lumiere_nfc, 'rb')
+
 
-class UbuntuKarmicUTF8(StringUtils, unittest.TestCase):
+class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase):
     uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
     output = 'lumi\xc3\xa8re'
     argv = 'lumi\xc3\xa8re'
@@ -157,8 +290,9 @@ class UbuntuKarmicUTF8(StringUtils, unittest.TestCase):
     filesystem_encoding = 'UTF-8'
     output_encoding = 'UTF-8'
     argv_encoding = 'UTF-8'
+    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
 
-class UbuntuKarmicLatin1(StringUtils, unittest.TestCase):
+class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase):
     uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
     output = 'lumi\xe8re'
     argv = 'lumi\xe8re'
@@ -166,32 +300,36 @@ class UbuntuKarmicLatin1(StringUtils, unittest.TestCase):
     filesystem_encoding = 'ISO-8859-1'
     output_encoding = 'ISO-8859-1'
     argv_encoding = 'ISO-8859-1'
+    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
 
-class WindowsXP(StringUtils, unittest.TestCase):
+class WindowsXP(EncodingUtil, unittest.TestCase):
     uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
     output = 'lumi\x8are'
     platform = 'win32'
     filesystem_encoding = 'mbcs'
     output_encoding = 'cp850'
     argv_encoding = 'ascii'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
 
-class WindowsXP_UTF8(StringUtils, unittest.TestCase):
+class WindowsXP_UTF8(EncodingUtil, unittest.TestCase):
     uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
     output = 'lumi\xc3\xa8re'
     platform = 'win32'
     filesystem_encoding = 'mbcs'
     output_encoding = 'cp65001'
     argv_encoding = 'ascii'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
 
-class WindowsVista(StringUtils, unittest.TestCase):
+class WindowsVista(EncodingUtil, unittest.TestCase):
     uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel'
     output = 'lumi\x8are'
     platform = 'win32'
     filesystem_encoding = 'mbcs'
     output_encoding = 'cp850'
     argv_encoding = 'ascii'
+    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
 
-class MacOSXLeopard(StringUtils, unittest.TestCase):
+class MacOSXLeopard(EncodingUtil, unittest.TestCase):
     uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
     output = 'lumi\xc3\xa8re'
     argv = 'lumi\xc3\xa8re'
@@ -199,15 +337,17 @@ class MacOSXLeopard(StringUtils, unittest.TestCase):
     filesystem_encoding = 'utf-8'
     output_encoding = 'UTF-8'
     argv_encoding = 'UTF-8'
+    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
 
-class MacOSXLeopard7bit(StringUtils, unittest.TestCase):
+class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase):
     uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
     platform = 'darwin'
     filesystem_encoding = 'utf-8'
     output_encoding = 'US-ASCII'
     argv_encoding = 'US-ASCII'
+    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
 
-class OpenBSD(StringUtils, unittest.TestCase):
+class OpenBSD(EncodingUtil, unittest.TestCase):
     uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)'
     platform = 'openbsd4'
     filesystem_encoding = '646'
index 61d58fef0b2d1a59711601990087728a9d654cb5..e5787851505c70b79198e46bfb482a6ebfc8aed0 100644 (file)
@@ -4,6 +4,7 @@ unicode and back.
 """
 
 import sys
+import os
 import re
 from allmydata.util.assertutil import precondition
 from twisted.python import usage
@@ -173,3 +174,44 @@ def unicode_platform():
     Does the current platform handle Unicode filenames natively?
     """
     return is_unicode_platform
+
+class FilenameEncodingError(Exception):
+    """
+    Filename cannot be encoded using the current encoding of your filesystem
+    (%s). Please configure your locale correctly or rename this file.
+    """
+    # NOTE(review): the docstring carries a '%s' placeholder that nothing visible
+    # here fills in; presumably a caller formats __doc__ with the encoding name --
+    # confirm before rewording it. Instances are raised with the offending path
+    # or filename as their only argument.
+    pass
+
+def listdir_unicode_fallback(path):
+    """
+    Byte-level stand-in for the Unicode listdir() API that Windows and
+    MacOS X provide natively: the unicode path is encoded with the filesystem
+    encoding, listed, and every entry is decoded back to unicode.
+
+    Raises FilenameEncodingError (carrying the offending path or entry) if
+    the path cannot be encoded or an entry cannot be decoded.
+    """
+    precondition(isinstance(path, unicode), path)
+
+    try:
+        encoded_path = path.encode(filesystem_encoding)
+    except (UnicodeEncodeError, UnicodeDecodeError):
+        raise FilenameEncodingError(path)
+
+    decoded_names = []
+    try:
+        # fn is bound by the loop itself, so the handler below always reports
+        # the entry whose decoding failed.
+        for fn in os.listdir(encoded_path):
+            decoded_names.append(unicode(fn, filesystem_encoding))
+    except UnicodeDecodeError:
+        raise FilenameEncodingError(fn)
+    return decoded_names
+
+def listdir_unicode(path):
+    """
+    List a directory given as a unicode path and return unicode names,
+    whether or not the platform offers a native Unicode filename API.
+    """
+    precondition(isinstance(path, unicode), path)
+
+    # Unix-like systems only expose the byte-level API, so go through the
+    # encode/decode fallback there.
+    if not is_unicode_platform:
+        return listdir_unicode_fallback(path)
+
+    # Windows and MacOS X: os.listdir() is given the unicode path directly.
+    return os.listdir(path)
index bd9deb43115daee48908d81baa90932edc48c5ac..46e1c9bfeafd47f351d4852846381a4911f0826e 100644 (file)
@@ -211,7 +211,7 @@ def read(path):
 
 def put_file(pathname, inf):
     # TODO: create temporary file and move into place?
-    outf = open_expanduser(pathname, "wb")
+    outf = open(os.path.expanduser(pathname), "wb")
     try:
         while True:
             data = inf.read(32768)
@@ -220,11 +220,3 @@ def put_file(pathname, inf):
             outf.write(data)
     finally:
         outf.close()
-
-def open_expanduser(path, mode):
-    assert isinstance(path, unicode), path
-    return open(os.path.expanduser(path), mode)
-
-def abspath_expanduser(path):
-    assert isinstance(path, unicode), path
-    return os.path.abspath(os.path.expanduser(path))