From: david-sarah Date: Sun, 18 Jul 2010 01:34:35 +0000 (-0700) Subject: Correct stringutils->encodingutil patch to be the newer version, rather than the... X-Git-Url: https://git.rkrishnan.org/pf/content/%22news.html/COPYING.TGPPL.html?a=commitdiff_plain;h=a8161c915a30e18c44166dba486923874d271e3f;p=tahoe-lafs%2Ftahoe-lafs.git Correct stringutils->encodingutil patch to be the newer version, rather than the old version that was committed in error. --- diff --git a/src/allmydata/scripts/tahoe_backup.py b/src/allmydata/scripts/tahoe_backup.py index 2525b2fa..b7155536 100644 --- a/src/allmydata/scripts/tahoe_backup.py +++ b/src/allmydata/scripts/tahoe_backup.py @@ -9,8 +9,7 @@ from allmydata.scripts.common import get_alias, escape_path, DEFAULT_ALIAS, \ from allmydata.scripts.common_http import do_http, HTTPError, format_http_error from allmydata.util import time_format from allmydata.scripts import backupdb -from allmydata.util.encodingutil import quote_output, to_str -from allmydata.util.fileutil import open_expanduser +from allmydata.util.encodingutil import listdir_unicode, quote_output, to_str, FilenameEncodingError from allmydata.util.assertutil import precondition @@ -167,14 +166,14 @@ class BackerUpper: compare_contents = {} # childname -> rocap try: - children = os.listdir(localpath) + children = listdir_unicode(localpath) except EnvironmentError: self.directories_skipped += 1 self.warn("WARNING: permission denied on directory %s" % quote_output(localpath)) children = [] - except (UnicodeEncodeError, UnicodeDecodeError): + except FilenameEncodingError: self.directories_skipped += 1 - self.warn("WARNING: could not list directory %s due to an encoding error" % quote_output(localpath)) + self.warn("WARNING: could not list directory %s due to a filename encoding error" % quote_output(localpath)) children = [] for child in self.options.filter_listdir(children): @@ -297,7 +296,7 @@ class BackerUpper: if must_upload: self.verboseprint("uploading %s.." % quote_output(childpath)) - infileobj = open_expanduser(childpath, "rb") + infileobj = open(childpath, "rb") url = self.options['node-url'] + "uri" resp = do_http("PUT", url, infileobj) if resp.status not in (200, 201): diff --git a/src/allmydata/scripts/tahoe_cp.py b/src/allmydata/scripts/tahoe_cp.py index 76fa95ef..5ef9de14 100644 --- a/src/allmydata/scripts/tahoe_cp.py +++ b/src/allmydata/scripts/tahoe_cp.py @@ -8,9 +8,8 @@ from allmydata.scripts.common import get_alias, escape_path, \ DefaultAliasMarker, TahoeError from allmydata.scripts.common_http import do_http, HTTPError from allmydata import uri -from allmydata.util.encodingutil import unicode_to_url, quote_output, to_str from allmydata.util import fileutil -from allmydata.util.fileutil import open_expanduser, abspath_expanduser +from allmydata.util.encodingutil import unicode_to_url, listdir_unicode, quote_output, to_str from allmydata.util.assertutil import precondition @@ -68,7 +67,7 @@ class LocalFileSource: return True def open(self, caps_only): - return open_expanduser(self.pathname, "rb") + return open(os.path.expanduser(self.pathname), "rb") class LocalFileTarget: @@ -101,7 +100,7 @@ class LocalDirectorySource: if self.children is not None: return self.children = {} - children = os.listdir(self.pathname) + children = listdir_unicode(self.pathname) for i,n in enumerate(children): self.progressfunc("examining %d of %d" % (i, len(children))) pn = os.path.join(self.pathname, n) @@ -129,7 +128,7 @@ class LocalDirectoryTarget: if self.children is not None: return self.children = {} - children = os.listdir(self.pathname) + children = listdir_unicode(self.pathname) for i,n in enumerate(children): self.progressfunc("examining %d of %d" % (i, len(children))) n = unicode(n) @@ -512,7 +511,7 @@ class Copier: rootcap, path = get_alias(self.aliases, destination_spec, None) if rootcap == DefaultAliasMarker: # no alias, so this is a local file - pathname = abspath_expanduser(path.decode('utf-8')) + pathname = os.path.abspath(os.path.expanduser(path.decode('utf-8'))) if not os.path.exists(pathname): t = LocalMissingTarget(pathname) elif os.path.isdir(pathname): @@ -552,7 +551,7 @@ class Copier: rootcap, path = get_alias(self.aliases, source_spec, None) if rootcap == DefaultAliasMarker: # no alias, so this is a local file - pathname = abspath_expanduser(path.decode('utf-8')) + pathname = os.path.abspath(os.path.expanduser(path.decode('utf-8'))) name = os.path.basename(pathname) if not os.path.exists(pathname): raise MissingSourceError(source_spec) diff --git a/src/allmydata/scripts/tahoe_get.py b/src/allmydata/scripts/tahoe_get.py index 63032ed4..280d8c05 100644 --- a/src/allmydata/scripts/tahoe_get.py +++ b/src/allmydata/scripts/tahoe_get.py @@ -1,9 +1,8 @@ -import urllib +import os, urllib from allmydata.scripts.common import get_alias, DEFAULT_ALIAS, escape_path, \ UnknownAliasError from allmydata.scripts.common_http import do_http, format_http_error -from allmydata.util.fileutil import open_expanduser def get(options): nodeurl = options['node-url'] @@ -27,7 +26,7 @@ def get(options): resp = do_http("GET", url) if resp.status in (200, 201,): if to_file: - outf = open_expanduser(to_file, "wb") + outf = open(os.path.expanduser(to_file), "wb") else: outf = stdout while True: diff --git a/src/allmydata/scripts/tahoe_put.py b/src/allmydata/scripts/tahoe_put.py index d646110e..eb578bec 100644 --- a/src/allmydata/scripts/tahoe_put.py +++ b/src/allmydata/scripts/tahoe_put.py @@ -1,11 +1,11 @@ +import os from cStringIO import StringIO import urllib from allmydata.scripts.common_http import do_http, format_http_success, format_http_error from allmydata.scripts.common import get_alias, DEFAULT_ALIAS, escape_path, \ UnknownAliasError from allmydata.util.encodingutil import quote_output -from allmydata.util.fileutil import open_expanduser def put(options): """ @@ -65,7 +65,7 @@ def put(options): if mutable: url += "?mutable=true" if from_file: - infileobj = open_expanduser(from_file, "rb") + infileobj = open(os.path.expanduser(from_file), "rb") else: # do_http() can't use stdin directly: for one thing, we need a # Content-Length field. So we currently must copy it. diff --git a/src/allmydata/test/test_backupdb.py b/src/allmydata/test/test_backupdb.py index 1d6d9734..6cd4ffa2 100644 --- a/src/allmydata/test/test_backupdb.py +++ b/src/allmydata/test/test_backupdb.py @@ -4,7 +4,7 @@ from StringIO import StringIO from twisted.trial import unittest from allmydata.util import fileutil -from allmydata.util.encodingutil import get_filesystem_encoding, unicode_platform +from allmydata.util.encodingutil import listdir_unicode, get_filesystem_encoding, unicode_platform from allmydata.util.assertutil import precondition from allmydata.scripts import backupdb @@ -249,7 +249,7 @@ class BackupDB(unittest.TestCase): self.failUnless(bdb) self.writeto(u"f\u00f6\u00f6.txt", "foo.txt") - files = [fn for fn in os.listdir(unicode(basedir)) if fn.endswith(".txt")] + files = [fn for fn in listdir_unicode(unicode(basedir)) if fn.endswith(".txt")] self.failUnlessEqual(len(files), 1) foo_fn = os.path.join(basedir, files[0]) #print foo_fn, type(foo_fn) diff --git a/src/allmydata/test/test_cli.py b/src/allmydata/test/test_cli.py index f52e468e..6ee4d1fe 100644 --- a/src/allmydata/test/test_cli.py +++ b/src/allmydata/test/test_cli.py @@ -31,8 +31,8 @@ from twisted.internet import threads # CLI tests use deferToThread from twisted.python import usage from allmydata.util.assertutil import precondition -from allmydata.util.encodingutil import unicode_platform, quote_output, \ - get_output_encoding, get_argv_encoding, get_filesystem_encoding, \ +from allmydata.util.encodingutil import listdir_unicode, unicode_platform, \ + quote_output, get_output_encoding, get_argv_encoding, get_filesystem_encoding, \ unicode_to_output, to_str timeout = 480 # deep_check takes 360s on Zandr's linksys box, others take > 240s @@ -441,7 +441,7 @@ class CLI(CLITestMixin, unittest.TestCase): for name in filenames: open(os.path.join(unicode(basedir), name), "wb").close() - for file in os.listdir(unicode(basedir)): + for file in listdir_unicode(unicode(basedir)): self.failUnlessIn(normalize(file), filenames) diff --git a/src/allmydata/test/test_encodingutil.py b/src/allmydata/test/test_encodingutil.py index 10310ca6..9a3c5f33 100644 --- a/src/allmydata/test/test_encodingutil.py +++ b/src/allmydata/test/test_encodingutil.py @@ -13,7 +13,9 @@ TEST_FILENAMES = ( # systems. if __name__ == "__main__": - import sys + import sys, os + import tempfile + import shutil import platform if len(sys.argv) != 2: @@ -21,7 +23,7 @@ if __name__ == "__main__": sys.exit(1) print - print "class MyWeirdOS(StringUtils, unittest.TestCase):" + print "class MyWeirdOS(EncodingUtil, unittest.TestCase):" print " uname = '%s'" % ' '.join(platform.uname()) if sys.platform != "win32": print " argv = %s" % repr(sys.argv[1]) @@ -29,21 +31,39 @@ if __name__ == "__main__": print " filesystem_encoding = '%s'" % sys.getfilesystemencoding() print " output_encoding = '%s'" % sys.stdout.encoding print " argv_encoding = '%s'" % (sys.platform == "win32" and 'ascii' or sys.stdout.encoding) + + try: + tmpdir = tempfile.mkdtemp() + for fname in TEST_FILENAMES: + open(os.path.join(tmpdir, fname), 'w').close() + + # Use Unicode API under Windows or MacOS X + if sys.platform in ('win32', 'darwin'): + dirlist = os.listdir(unicode(tmpdir)) + else: + dirlist = os.listdir(tmpdir) + + print " dirlist = %s" % repr(dirlist) + except: + print " # Oops, I cannot write filenames containing non-ascii characters" print + shutil.rmtree(tmpdir) sys.exit(0) from twisted.trial import unittest from mock import patch -import sys, locale +import os, sys, locale from allmydata.test.common_util import ReallyEqualMixin from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \ - unicode_to_output, unicode_platform, get_output_encoding, _reload + unicode_to_output, unicode_platform, listdir_unicode, FilenameEncodingError, \ + get_output_encoding, get_filesystem_encoding, _reload +from allmydata.dirnode import normalize from twisted.python import usage -class StringUtilsErrors(ReallyEqualMixin, unittest.TestCase): +class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase): def tearDown(self): _reload() @@ -103,8 +123,55 @@ class StringUtilsErrors(ReallyEqualMixin, unittest.TestCase): _reload() self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc) + @patch('os.listdir') + def test_no_unicode_normalization(self, mock): + # Pretend to run on a Unicode platform. + # We normalized to NFC in 1.7beta, but we now don't. + orig_platform = sys.platform + try: + sys.platform = 'darwin' + mock.return_value = [Artonwall_nfd] + _reload() + self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd]) + finally: + sys.platform = orig_platform -class StringUtils(ReallyEqualMixin): +# The following tests apply only to platforms that don't store filenames as +# Unicode entities on the filesystem. +class EncodingUtilNonUnicodePlatform(unittest.TestCase): + def setUp(self): + # Mock sys.platform because unicode_platform() uses it + self.original_platform = sys.platform + sys.platform = 'linux' + + def tearDown(self): + sys.platform = self.original_platform + _reload() + + @patch('sys.getfilesystemencoding') + @patch('os.listdir') + def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding): + # What happens if latin1-encoded filenames are encountered on an UTF-8 + # filesystem? + mock_listdir.return_value = [ + lumiere_nfc.encode('utf-8'), + lumiere_nfc.encode('latin1')] + + mock_getfilesystemencoding.return_value = 'utf-8' + _reload() + self.failUnlessRaises(FilenameEncodingError, + listdir_unicode, + u'/dummy') + + # We're trying to list a directory whose name cannot be represented in + # the filesystem encoding. This should fail. + mock_getfilesystemencoding.return_value = 'ascii' + _reload() + self.failUnlessRaises(FilenameEncodingError, + listdir_unicode, + u'/' + lumiere_nfc) + +class EncodingUtil(ReallyEqualMixin): def setUp(self): # Mock sys.platform because unicode_platform() uses it self.original_platform = sys.platform @@ -148,8 +215,74 @@ class StringUtils(ReallyEqualMixin): _reload() self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform]) + @patch('sys.getfilesystemencoding') + @patch('os.listdir') + def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding): + if 'dirlist' not in dir(self): + return + + try: + u"test".encode(self.filesystem_encoding) + except (LookupError, AttributeError): + raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding " + "that we are testing for the benefit of a different platform." + % (self.filesystem_encoding,)) + + mock_listdir.return_value = self.dirlist + mock_getfilesystemencoding.return_value = self.filesystem_encoding + + _reload() + filenames = listdir_unicode(u'/dummy') + + self.failUnlessEqual(set([normalize(fname) for fname in filenames]), + set(TEST_FILENAMES)) + + +class StdlibUnicode(unittest.TestCase): + """This mainly tests that some of the stdlib functions support Unicode paths, but also that + listdir_unicode works for valid filenames.""" + + def skip_if_cannot_represent_filename(self, u): + enc = get_filesystem_encoding() + if not unicode_platform(): + try: + u.encode(enc) + except UnicodeEncodeError: + raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.") + + def test_mkdir_open_exists_abspath_listdir_expanduser(self): + self.skip_if_cannot_represent_filename(lumiere_nfc) + + try: + os.mkdir(lumiere_nfc) + except EnvironmentError, e: + raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run " + "does not support Unicode, even though the platform does." % (e,)) + + fn = lumiere_nfc + '/' + lumiere_nfc + '.txt' + open(fn, 'wb').close() + self.failUnless(os.path.exists(fn)) + self.failUnless(os.path.exists(os.path.abspath(fn))) + filenames = listdir_unicode(lumiere_nfc) + + # We only require that the listing includes a filename that is canonically equivalent + # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent). + self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames])) + + expanded = os.path.expanduser("~/" + lumiere_nfc) + self.failIfIn("~", expanded) + self.failUnless(expanded.endswith(lumiere_nfc), expanded) + + @patch('sys.getfilesystemencoding') + def test_open_unrepresentable(self, mock): + if unicode_platform(): + raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.") + + mock.return_value = 'ascii' + self.failUnlessRaises(UnicodeEncodeError, open, lumiere_nfc, 'rb') + -class UbuntuKarmicUTF8(StringUtils, unittest.TestCase): +class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase): uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' output = 'lumi\xc3\xa8re' argv = 'lumi\xc3\xa8re' @@ -157,8 +290,9 @@ class UbuntuKarmicUTF8(StringUtils, unittest.TestCase): filesystem_encoding = 'UTF-8' output_encoding = 'UTF-8' argv_encoding = 'UTF-8' + dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt'] -class UbuntuKarmicLatin1(StringUtils, unittest.TestCase): +class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase): uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' output = 'lumi\xe8re' argv = 'lumi\xe8re' @@ -166,32 +300,36 @@ class UbuntuKarmicLatin1(StringUtils, unittest.TestCase): filesystem_encoding = 'ISO-8859-1' output_encoding = 'ISO-8859-1' argv_encoding = 'ISO-8859-1' + dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3'] -class WindowsXP(StringUtils, unittest.TestCase): +class WindowsXP(EncodingUtil, unittest.TestCase): uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD' output = 'lumi\x8are' platform = 'win32' filesystem_encoding = 'mbcs' output_encoding = 'cp850' argv_encoding = 'ascii' + dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3'] -class WindowsXP_UTF8(StringUtils, unittest.TestCase): +class WindowsXP_UTF8(EncodingUtil, unittest.TestCase): uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD' output = 'lumi\xc3\xa8re' platform = 'win32' filesystem_encoding = 'mbcs' output_encoding = 'cp65001' argv_encoding = 'ascii' + dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3'] -class WindowsVista(StringUtils, unittest.TestCase): +class WindowsVista(EncodingUtil, unittest.TestCase): uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel' output = 'lumi\x8are' platform = 'win32' filesystem_encoding = 'mbcs' output_encoding = 'cp850' argv_encoding = 'ascii' + dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3'] -class MacOSXLeopard(StringUtils, unittest.TestCase): +class MacOSXLeopard(EncodingUtil, unittest.TestCase): uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc' output = 'lumi\xc3\xa8re' argv = 'lumi\xc3\xa8re' @@ -199,15 +337,17 @@ class MacOSXLeopard(StringUtils, unittest.TestCase): filesystem_encoding = 'utf-8' output_encoding = 'UTF-8' argv_encoding = 'UTF-8' + dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file'] -class MacOSXLeopard7bit(StringUtils, unittest.TestCase): +class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase): uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc' platform = 'darwin' filesystem_encoding = 'utf-8' output_encoding = 'US-ASCII' argv_encoding = 'US-ASCII' + dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file'] -class OpenBSD(StringUtils, unittest.TestCase): +class OpenBSD(EncodingUtil, unittest.TestCase): uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)' platform = 'openbsd4' filesystem_encoding = '646' diff --git a/src/allmydata/util/encodingutil.py b/src/allmydata/util/encodingutil.py index 61d58fef..e5787851 100644 --- a/src/allmydata/util/encodingutil.py +++ b/src/allmydata/util/encodingutil.py @@ -4,6 +4,7 @@ unicode and back. """ import sys +import os import re from allmydata.util.assertutil import precondition from twisted.python import usage @@ -173,3 +174,44 @@ def unicode_platform(): Does the current platform handle Unicode filenames natively? """ return is_unicode_platform + +class FilenameEncodingError(Exception): + """ + Filename cannot be encoded using the current encoding of your filesystem + (%s). Please configure your locale correctly or rename this file. + """ + pass + +def listdir_unicode_fallback(path): + """ + This function emulates a fallback Unicode API similar to one available + under Windows or MacOS X. + + If badly encoded filenames are encountered, an exception is raised. + """ + precondition(isinstance(path, unicode), path) + + try: + byte_path = path.encode(filesystem_encoding) + except (UnicodeEncodeError, UnicodeDecodeError): + raise FilenameEncodingError(path) + + try: + return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)] + except UnicodeDecodeError: + raise FilenameEncodingError(fn) + +def listdir_unicode(path): + """ + Wrapper around listdir() which provides safe access to the convenient + Unicode API even under platforms that don't provide one natively. + """ + precondition(isinstance(path, unicode), path) + + # On Windows and MacOS X, the Unicode API is used + # On other platforms (ie. Unix systems), the byte-level API is used + + if is_unicode_platform: + return os.listdir(path) + else: + return listdir_unicode_fallback(path) diff --git a/src/allmydata/util/fileutil.py b/src/allmydata/util/fileutil.py index bd9deb43..46e1c9bf 100644 --- a/src/allmydata/util/fileutil.py +++ b/src/allmydata/util/fileutil.py @@ -211,7 +211,7 @@ def read(path): def put_file(pathname, inf): # TODO: create temporary file and move into place? - outf = open_expanduser(pathname, "wb") + outf = open(os.path.expanduser(pathname), "wb") try: while True: data = inf.read(32768) @@ -220,11 +220,3 @@ def put_file(pathname, inf): outf.write(data) finally: outf.close() - -def open_expanduser(path, mode): - assert isinstance(path, unicode), path - return open(os.path.expanduser(path), mode) - -def abspath_expanduser(path): - assert isinstance(path, unicode), path - return os.path.abspath(os.path.expanduser(path))