immutable repairer: populate servers-responding properly
authorBrian Warner <warner@lothar.com>
Wed, 16 May 2012 23:50:43 +0000 (16:50 -0700)
committerBrian Warner <warner@lothar.com>
Wed, 16 May 2012 23:55:09 +0000 (16:55 -0700)
If a server did not respond to the pre-repair filecheck, but did respond
to the repair, that server was not correctly added to the
RepairResults.data["servers-responding"] list. (This resulted from a
buggy usage of DictOfSets.union() in filenode.py).

In addition, servers to which filecheck queries were sent, but did not
respond, were incorrectly added to the servers-responding list
anyawys. (This resulted from code in the checker.py not paying attention
to the 'responded' flag).

The first bug was neatly masked by the second: it's pretty rare to have
a server suddenly start responding in the one-second window between a
filecheck and a subsequent repair, and if the server was around for the
filecheck, you'd never notice the problem. I only spotted the smelly
code while I was changing it for IServer cleanup purposes.

I added coverage to test_repairer.py for this. Trying to get that test
to fail before fixing the first bug is what led me to discover the
second bug. I also had to update test_corrupt_file_verno, since it was
incorrectly asserting that 10 servers responded, when in fact one of
them throws an error (but the second bug was causing it to be reported
anyways).

src/allmydata/immutable/checker.py
src/allmydata/immutable/filenode.py
src/allmydata/test/no_network.py
src/allmydata/test/test_repairer.py

index 73ebc1ec18604a68bc18eef99ea29544fff6b9c4..4b7a00590aedd5a830a67e5ef8571ec176a60a57 100644 (file)
@@ -747,6 +747,7 @@ class Checker(log.PrefixingLogMixin):
         servers = {} # {serverid: set(sharenums)}
         corruptshare_locators = [] # (serverid, storageindex, sharenum)
         incompatibleshare_locators = [] # (serverid, storageindex, sharenum)
+        servers_responding = set() # serverid
 
         for verified, server, corrupt, incompatible, responded in results:
             server_id = server.get_serverid()
@@ -757,6 +758,8 @@ class Checker(log.PrefixingLogMixin):
                 corruptshare_locators.append((server_id, SI, sharenum))
             for sharenum in incompatible:
                 incompatibleshare_locators.append((server_id, SI, sharenum))
+            if responded:
+                servers_responding.add(server_id)
 
         d['count-shares-good'] = len(verifiedshares)
         d['count-good-share-hosts'] = len([s for s in servers.keys() if servers[s]])
@@ -780,7 +783,7 @@ class Checker(log.PrefixingLogMixin):
             d['count-recoverable-versions'] = 0
             d['count-unrecoverable-versions'] = 1
 
-        d['servers-responding'] = list(servers)
+        d['servers-responding'] = list(servers_responding)
         d['sharemap'] = verifiedshares
         # no such thing as wrong shares of an immutable file
         d['count-wrong-shares'] = 0
index 73f7173e3db7e5fbf98456ef19a7bcd90bbc3dce..c7fa82f6a4193bff7bdb702b3807f49ec7953fc5 100644 (file)
@@ -119,8 +119,10 @@ class CiphertextFileNode:
                     assert isinstance(sm, DictOfSets), sm
                     sm.update(ur.sharemap)
                     servers_responding = set(prr.data['servers-responding'])
-                    servers_responding.union(ur.sharemap.iterkeys())
-                    prr.data['servers-responding'] = list(servers_responding)
+                    for shnum, serverids in ur.sharemap.items():
+                        servers_responding.update(serverids)
+                    servers_responding = sorted(servers_responding)
+                    prr.data['servers-responding'] = servers_responding
                     prr.data['count-shares-good'] = len(sm)
                     good_hosts = len(reduce(set.union, sm.itervalues(), set()))
                     prr.data['count-good-share-hosts'] = good_hosts
index 82ada09710eb013bb83848b52763ee00be9a79aa..08ef42770c04bfe3788eca967f6ba43c34d1d5ea 100644 (file)
@@ -67,6 +67,8 @@ class LocalWrapper:
 
         def _call():
             if self.broken:
+                if self.broken is not True: # a counter, not boolean
+                    self.broken -= 1
                 raise IntentionalError("I was asked to break")
             if self.hung_until:
                 d2 = defer.Deferred()
@@ -299,10 +301,11 @@ class NoNetworkGrid(service.MultiService):
         self.rebuild_serverlist()
         return ss
 
-    def break_server(self, serverid):
+    def break_server(self, serverid, count=True):
         # mark the given server as broken, so it will throw exceptions when
-        # asked to hold a share or serve a share
-        self.wrappers_by_id[serverid].broken = True
+        # asked to hold a share or serve a share. If count= is a number,
+        # throw that many exceptions before starting to work again.
+        self.wrappers_by_id[serverid].broken = count
 
     def hang_server(self, serverid):
         # hang the given server
index 94cd9ee4b39c1e752dbaa0f3cff9b35a3063f7ed..168d84c2e5c4536a330587b5efed59765e6e24f0 100644 (file)
@@ -170,7 +170,7 @@ class Verifier(GridTestMixin, unittest.TestCase, RepairTestMixin):
         self.failUnless(data['count-shares-needed'] == 3, data)
         self.failUnless(data['count-shares-expected'] == 10, data)
         self.failUnless(data['count-good-share-hosts'] == 9, data)
-        self.failUnless(len(data['servers-responding']) == 10, data)
+        self.failUnless(len(data['servers-responding']) == 9, data)
         self.failUnless(len(data['list-corrupt-shares']) == 0, data)
 
     def test_corrupt_file_verno(self):
@@ -706,6 +706,27 @@ class Repairer(GridTestMixin, unittest.TestCase, RepairTestMixin,
         d.addCallback(_check)
         return d
 
+    def test_servers_responding(self):
+        self.basedir = "repairer/Repairer/servers_responding"
+        self.set_up_grid(num_clients=2)
+        d = self.upload_and_stash()
+        # now cause one of the servers to not respond during the pre-repair
+        # filecheck, but then *do* respond to the post-repair filecheck
+        def _then(ign):
+            ss = self.g.servers_by_number[0]
+            self.g.break_server(ss.my_nodeid, count=1)
+            self.delete_shares_numbered(self.uri, [9])
+            return self.c0_filenode.check_and_repair(Monitor())
+        d.addCallback(_then)
+        def _check(rr):
+            # this exercises a bug in which the servers-responding list did
+            # not include servers that responded to the Repair, but which did
+            # not respond to the pre-repair filecheck
+            prr = rr.get_post_repair_results()
+            expected = set(self.g.get_all_serverids())
+            self.failUnlessEqual(expected, set(prr.data["servers-responding"]))
+        d.addCallback(_check)
+        return d
 
 # XXX extend these tests to show that the checker detects which specific
 # share on which specific server is broken -- this is necessary so that the