Thu May 17 00:55:09 BST 2012 Brian Warner <warner@lothar.com>
author     Daira Hopwood <daira@jacaranda.org>
           Thu, 5 Sep 2013 16:54:10 +0000 (17:54 +0100)
committer  Daira Hopwood <daira@jacaranda.org>
           Thu, 5 Sep 2013 16:54:10 +0000 (17:54 +0100)
  * immutable repairer: populate servers-responding properly

  If a server did not respond to the pre-repair filecheck, but did respond
  to the repair, that server was not correctly added to the
  RepairResults.data["servers-responding"] list. (This resulted from a
  buggy usage of DictOfSets.union() in filenode.py).
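
  The root cause is that Python's set.union() returns a *new* set and leaves
  the receiver untouched, so calling it without using the return value
  silently discards the merged result. A minimal illustration, with plain
  sets standing in for the real Tahoe objects:

      responding = {"server-a"}
      repaired_by = {"server-b"}

      responding.union(repaired_by)      # return value discarded -- the bug
      assert responding == {"server-a"}  # "server-b" was never recorded

      responding.update(repaired_by)     # mutates in place -- the fix
      assert responding == {"server-a", "server-b"}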

  In addition, servers to which filecheck queries were sent, but which did
  not respond, were incorrectly added to the servers-responding list
  anyways. (This resulted from code in checker.py not paying attention to
  the 'responded' flag).
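
  In outline, the checker-side fix is to accumulate a separate set that is
  only fed when a server's 'responded' flag is true, rather than reporting
  every server that was merely queried. A simplified sketch with made-up
  server ids (the real loop over (verified, server, corrupt, incompatible,
  responded) tuples is in the checker.py hunk below):

      servers_responding = set()
      results = [("s1", True), ("s2", False), ("s3", True)]
      for server_id, responded in results:
          if responded:                   # skip servers that never answered
              servers_responding.add(server_id)
      assert servers_responding == {"s1", "s3"}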

  The first bug was neatly masked by the second: it's pretty rare to have
  a server suddenly start responding in the one-second window between a
  filecheck and a subsequent repair, and if the server was around for the
  filecheck, you'd never notice the problem. I only spotted the smelly
  code while I was changing it for IServer cleanup purposes.

  I added coverage to test_repairer.py for this. Trying to get that test
  to fail before fixing the first bug is what led me to discover the
  second bug. I also had to update test_corrupt_file_verno, since it was
  incorrectly asserting that 10 servers responded, when in fact one of
  them throws an error (but the second bug was causing it to be reported
  anyways).

src/allmydata/immutable/checker.py
src/allmydata/immutable/filenode.py
src/allmydata/test/no_network.py
src/allmydata/test/test_repairer.py

index 73ebc1ec18604a68bc18eef99ea29544fff6b9c4..4b7a00590aedd5a830a67e5ef8571ec176a60a57 100644 (file)
@@ -747,6 +747,7 @@ class Checker(log.PrefixingLogMixin):
         servers = {} # {serverid: set(sharenums)}
         corruptshare_locators = [] # (serverid, storageindex, sharenum)
         incompatibleshare_locators = [] # (serverid, storageindex, sharenum)
+        servers_responding = set() # serverid
 
         for verified, server, corrupt, incompatible, responded in results:
             server_id = server.get_serverid()
@@ -757,6 +758,8 @@ class Checker(log.PrefixingLogMixin):
                 corruptshare_locators.append((server_id, SI, sharenum))
             for sharenum in incompatible:
                 incompatibleshare_locators.append((server_id, SI, sharenum))
+            if responded:
+                servers_responding.add(server_id)
 
         d['count-shares-good'] = len(verifiedshares)
         d['count-good-share-hosts'] = len([s for s in servers.keys() if servers[s]])
@@ -780,7 +783,7 @@ class Checker(log.PrefixingLogMixin):
             d['count-recoverable-versions'] = 0
             d['count-unrecoverable-versions'] = 1
 
-        d['servers-responding'] = list(servers)
+        d['servers-responding'] = list(servers_responding)
         d['sharemap'] = verifiedshares
         # no such thing as wrong shares of an immutable file
         d['count-wrong-shares'] = 0
index ba731cd2eaaad06546eac084baf013744b565f65..7a3a991912c8fe2ca811add9e151cd1fbd756b3d 100644 (file)
@@ -119,8 +119,10 @@ class CiphertextFileNode:
                     assert isinstance(sm, DictOfSets), sm
                     sm.update(ur.sharemap)
                     servers_responding = set(prr.data['servers-responding'])
-                    servers_responding.union(ur.sharemap.iterkeys())
-                    prr.data['servers-responding'] = list(servers_responding)
+                    for shnum, serverids in ur.sharemap.items():
+                        servers_responding.update(serverids)
+                    servers_responding = sorted(servers_responding)
+                    prr.data['servers-responding'] = servers_responding
                     prr.data['count-shares-good'] = len(sm)
                     prr.data['count-good-share-hosts'] = len(sm)
                     is_healthy = bool(len(sm) >= verifycap.total_shares)
index bb9c8f2b98d8f0a77fc5602d86216bc520de7e00..ffb6708a41ab106ecbfa6076ebe0e77c4e965388 100644 (file)
@@ -67,6 +67,8 @@ class LocalWrapper:
 
         def _call():
             if self.broken:
+                if self.broken is not True: # a counter, not boolean
+                    self.broken -= 1
                 raise IntentionalError("I was asked to break")
             if self.hung_until:
                 d2 = defer.Deferred()
@@ -290,10 +292,11 @@ class NoNetworkGrid(service.MultiService):
         del self.proxies_by_id[serverid]
         self.rebuild_serverlist()
 
-    def break_server(self, serverid):
+    def break_server(self, serverid, count=True):
         # mark the given server as broken, so it will throw exceptions when
-        # asked to hold a share or serve a share
-        self.wrappers_by_id[serverid].broken = True
+        # asked to hold a share or serve a share. If count= is a number,
+        # throw that many exceptions before starting to work again.
+        self.wrappers_by_id[serverid].broken = count
 
     def hang_server(self, serverid):
         # hang the given server
index 94cd9ee4b39c1e752dbaa0f3cff9b35a3063f7ed..168d84c2e5c4536a330587b5efed59765e6e24f0 100644 (file)
@@ -170,7 +170,7 @@ class Verifier(GridTestMixin, unittest.TestCase, RepairTestMixin):
         self.failUnless(data['count-shares-needed'] == 3, data)
         self.failUnless(data['count-shares-expected'] == 10, data)
         self.failUnless(data['count-good-share-hosts'] == 9, data)
-        self.failUnless(len(data['servers-responding']) == 10, data)
+        self.failUnless(len(data['servers-responding']) == 9, data)
         self.failUnless(len(data['list-corrupt-shares']) == 0, data)
 
     def test_corrupt_file_verno(self):
@@ -706,6 +706,27 @@ class Repairer(GridTestMixin, unittest.TestCase, RepairTestMixin,
         d.addCallback(_check)
         return d
 
+    def test_servers_responding(self):
+        self.basedir = "repairer/Repairer/servers_responding"
+        self.set_up_grid(num_clients=2)
+        d = self.upload_and_stash()
+        # now cause one of the servers to not respond during the pre-repair
+        # filecheck, but then *do* respond to the post-repair filecheck
+        def _then(ign):
+            ss = self.g.servers_by_number[0]
+            self.g.break_server(ss.my_nodeid, count=1)
+            self.delete_shares_numbered(self.uri, [9])
+            return self.c0_filenode.check_and_repair(Monitor())
+        d.addCallback(_then)
+        def _check(rr):
+            # this exercises a bug in which the servers-responding list did
+            # not include servers that responded to the Repair, but which did
+            # not respond to the pre-repair filecheck
+            prr = rr.get_post_repair_results()
+            expected = set(self.g.get_all_serverids())
+            self.failUnlessEqual(expected, set(prr.data["servers-responding"]))
+        d.addCallback(_check)
+        return d
 
 # XXX extend these tests to show that the checker detects which specific
 # share on which specific server is broken -- this is necessary so that the