From: Brian Warner Date: Wed, 16 May 2012 23:50:43 +0000 (-0700) Subject: immutable repairer: populate servers-responding properly X-Git-Url: https://git.rkrishnan.org/%5B/frontends/flags/index.php?a=commitdiff_plain;h=9acf5beebd41320de6dbf5e236244dda3349d1f8;p=tahoe-lafs%2Ftahoe-lafs.git immutable repairer: populate servers-responding properly If a server did not respond to the pre-repair filecheck, but did respond to the repair, that server was not correctly added to the RepairResults.data["servers-responding"] list. (This resulted from a buggy usage of DictOfSets.union() in filenode.py). In addition, servers to which filecheck queries were sent, but did not respond, were incorrectly added to the servers-responding list anyways. (This resulted from code in the checker.py not paying attention to the 'responded' flag). The first bug was neatly masked by the second: it's pretty rare to have a server suddenly start responding in the one-second window between a filecheck and a subsequent repair, and if the server was around for the filecheck, you'd never notice the problem. I only spotted the smelly code while I was changing it for IServer cleanup purposes. I added coverage to test_repairer.py for this. Trying to get that test to fail before fixing the first bug is what led me to discover the second bug. I also had to update test_corrupt_file_verno, since it was incorrectly asserting that 10 servers responded, when in fact one of them throws an error (but the second bug was causing it to be reported anyways). 
--- diff --git a/src/allmydata/immutable/checker.py b/src/allmydata/immutable/checker.py index 73ebc1ec..4b7a0059 100644 --- a/src/allmydata/immutable/checker.py +++ b/src/allmydata/immutable/checker.py @@ -747,6 +747,7 @@ class Checker(log.PrefixingLogMixin): servers = {} # {serverid: set(sharenums)} corruptshare_locators = [] # (serverid, storageindex, sharenum) incompatibleshare_locators = [] # (serverid, storageindex, sharenum) + servers_responding = set() # serverid for verified, server, corrupt, incompatible, responded in results: server_id = server.get_serverid() @@ -757,6 +758,8 @@ class Checker(log.PrefixingLogMixin): corruptshare_locators.append((server_id, SI, sharenum)) for sharenum in incompatible: incompatibleshare_locators.append((server_id, SI, sharenum)) + if responded: + servers_responding.add(server_id) d['count-shares-good'] = len(verifiedshares) d['count-good-share-hosts'] = len([s for s in servers.keys() if servers[s]]) @@ -780,7 +783,7 @@ class Checker(log.PrefixingLogMixin): d['count-recoverable-versions'] = 0 d['count-unrecoverable-versions'] = 1 - d['servers-responding'] = list(servers) + d['servers-responding'] = list(servers_responding) d['sharemap'] = verifiedshares # no such thing as wrong shares of an immutable file d['count-wrong-shares'] = 0 diff --git a/src/allmydata/immutable/filenode.py b/src/allmydata/immutable/filenode.py index 73f7173e..c7fa82f6 100644 --- a/src/allmydata/immutable/filenode.py +++ b/src/allmydata/immutable/filenode.py @@ -119,8 +119,10 @@ class CiphertextFileNode: assert isinstance(sm, DictOfSets), sm sm.update(ur.sharemap) servers_responding = set(prr.data['servers-responding']) - servers_responding.union(ur.sharemap.iterkeys()) - prr.data['servers-responding'] = list(servers_responding) + for shnum, serverids in ur.sharemap.items(): + servers_responding.update(serverids) + servers_responding = sorted(servers_responding) + prr.data['servers-responding'] = servers_responding prr.data['count-shares-good'] = 
len(sm) good_hosts = len(reduce(set.union, sm.itervalues(), set())) prr.data['count-good-share-hosts'] = good_hosts diff --git a/src/allmydata/test/no_network.py b/src/allmydata/test/no_network.py index 82ada097..08ef4277 100644 --- a/src/allmydata/test/no_network.py +++ b/src/allmydata/test/no_network.py @@ -67,6 +67,8 @@ class LocalWrapper: def _call(): if self.broken: + if self.broken is not True: # a counter, not boolean + self.broken -= 1 raise IntentionalError("I was asked to break") if self.hung_until: d2 = defer.Deferred() @@ -299,10 +301,11 @@ class NoNetworkGrid(service.MultiService): self.rebuild_serverlist() return ss - def break_server(self, serverid): + def break_server(self, serverid, count=True): # mark the given server as broken, so it will throw exceptions when - # asked to hold a share or serve a share - self.wrappers_by_id[serverid].broken = True + # asked to hold a share or serve a share. If count= is a number, + # throw that many exceptions before starting to work again. 
+ self.wrappers_by_id[serverid].broken = count def hang_server(self, serverid): # hang the given server diff --git a/src/allmydata/test/test_repairer.py b/src/allmydata/test/test_repairer.py index 94cd9ee4..168d84c2 100644 --- a/src/allmydata/test/test_repairer.py +++ b/src/allmydata/test/test_repairer.py @@ -170,7 +170,7 @@ class Verifier(GridTestMixin, unittest.TestCase, RepairTestMixin): self.failUnless(data['count-shares-needed'] == 3, data) self.failUnless(data['count-shares-expected'] == 10, data) self.failUnless(data['count-good-share-hosts'] == 9, data) - self.failUnless(len(data['servers-responding']) == 10, data) + self.failUnless(len(data['servers-responding']) == 9, data) self.failUnless(len(data['list-corrupt-shares']) == 0, data) def test_corrupt_file_verno(self): @@ -706,6 +706,27 @@ class Repairer(GridTestMixin, unittest.TestCase, RepairTestMixin, d.addCallback(_check) return d + def test_servers_responding(self): + self.basedir = "repairer/Repairer/servers_responding" + self.set_up_grid(num_clients=2) + d = self.upload_and_stash() + # now cause one of the servers to not respond during the pre-repair + # filecheck, but then *do* respond to the post-repair filecheck + def _then(ign): + ss = self.g.servers_by_number[0] + self.g.break_server(ss.my_nodeid, count=1) + self.delete_shares_numbered(self.uri, [9]) + return self.c0_filenode.check_and_repair(Monitor()) + d.addCallback(_then) + def _check(rr): + # this exercises a bug in which the servers-responding list did + # not include servers that responded to the Repair, but which did + # not respond to the pre-repair filecheck + prr = rr.get_post_repair_results() + expected = set(self.g.get_all_serverids()) + self.failUnlessEqual(expected, set(prr.data["servers-responding"])) + d.addCallback(_check) + return d # XXX extend these tests to show that the checker detects which specific # share on which specific server is broken -- this is necessary so that the