From: Brian Warner Date: Wed, 10 Dec 2008 05:44:49 +0000 (-0700) Subject: mutable publish: if we are surprised by shares that match what we would have written... X-Git-Url: https://git.rkrishnan.org/%5B/%5D%20/reliability?a=commitdiff_plain;h=278c47b9bd859343079d4e0d9718b336b624d0b1;p=tahoe-lafs%2Ftahoe-lafs.git mutable publish: if we are surprised by shares that match what we would have written anyways, don't be surprised. This should fix one of the two #546 problems, in which we re-use a server and forget that we already sent them a share. --- diff --git a/src/allmydata/mutable/publish.py b/src/allmydata/mutable/publish.py index 59460c4f..345581bb 100644 --- a/src/allmydata/mutable/publish.py +++ b/src/allmydata/mutable/publish.py @@ -386,13 +386,14 @@ class Publish: i = 0 for shnum in homeless_shares: (ignored1, ignored2, peerid, ss) = peerlist[i] - # TODO: if we are forced to send a share to a server that already - # has one, we may have two write requests in flight, and the + # if we are forced to send a share to a server that already has + # one, we may have two write requests in flight, and the # servermap (which was computed before either request was sent) - # won't reflect the new shares, so the second response will cause - # us to be surprised ("unexpected share on peer"), causing the - # publish to fail with an UncoordinatedWriteError. This is - # troublesome but not really a bit problem. Fix it at some point. + # won't reflect the new shares, so the second response will be + # surprising. There is code in _got_write_answer() to tolerate + # this, otherwise it would cause the publish to fail with an + # UncoordinatedWriteError. See #546 for details of the trouble + # this used to cause. self.goal.add( (peerid, shnum) ) self.connections[peerid] = ss i += 1 @@ -483,6 +484,7 @@ class Publish: root_hash = share_hash_tree[0] assert len(root_hash) == 32 self.log("my new root_hash is %s" % base32.b2a(root_hash)) + self._new_version_info = (self._new_seqnum, root_hash, self.salt) prefix = pack_prefix(self._new_seqnum, root_hash, self.salt, self.required_shares, self.total_shares, @@ -672,7 +674,67 @@ class Publish: wrote, read_data = answer surprise_shares = set(read_data.keys()) - set(shnums) - if surprise_shares: + + surprised = False + for shnum in surprise_shares: + # read_data is a dict mapping shnum to checkstring (SIGNED_PREFIX) + checkstring = read_data[shnum][0] + their_version_info = unpack_checkstring(checkstring) + if their_version_info == self._new_version_info: + # they have the right share, somehow + + if (peerid,shnum) in self.goal: + # and we want them to have it, so we probably sent them a + # copy in an earlier write. This is ok, and avoids the + # #546 problem. + continue + + # They aren't in our goal, but they are still for the right + # version. Somebody else wrote them, and it's a convergent + # uncoordinated write. Pretend this is ok (don't be + # surprised), since I suspect there's a decent chance that + # we'll hit this in normal operation. + continue + + else: + # the new shares are of a different version + if peerid in self._servermap.reachable_peers: + # we asked them about their shares, so we had knowledge + # of what they used to have. Any surprising shares must + # have come from someone else, so UCW. + surprised = True + else: + # we didn't ask them, and now we've discovered that they + # have a share we didn't know about. This indicates that + # mapupdate should have wokred harder and asked more + # servers before concluding that it knew about them all. + + # signal UCW, but make sure to ask this peer next time, + # so we'll remember to update it if/when we retry. + surprised = True + # TODO: ask this peer next time. I don't yet have a good + # way to do this. Two insufficient possibilities are: + # + # self._servermap.add_new_share(peerid, shnum, verinfo, now) + # but that requires fetching/validating/parsing the whole + # version string, and all we have is the checkstring + # self._servermap.mark_bad_share(peerid, shnum, checkstring) + # that will make publish overwrite the share next time, + # but it won't re-query the server, and it won't make + # mapupdate search further + + # TODO later: when publish starts, do + # servermap.get_best_version(), extract the seqnum, + # subtract one, and store as highest-replaceable-seqnum. + # Then, if this surprise-because-we-didn't-ask share is + # of highest-replaceable-seqnum or lower, we're allowed + # to replace it: send out a new writev (or rather add it + # to self.goal and loop). + pass + + surprised = True + + if surprised: self.log("they had shares %s that we didn't know about" % (list(surprise_shares),), parent=lp, level=log.WEIRD, umid="un9CSQ") diff --git a/src/allmydata/test/test_mutable.py b/src/allmydata/test/test_mutable.py index d2cc36a6..329912bc 100644 --- a/src/allmydata/test/test_mutable.py +++ b/src/allmydata/test/test_mutable.py @@ -1959,6 +1959,38 @@ class Problems(unittest.TestCase, testutil.ShouldFailMixin): d.addCallback(lambda res: self.failUnlessEqual(res, "contents 2")) return d + def test_bad_server_overlap(self): + # like test_bad_server, but with no extra unused servers to fall back + # upon. This means that we must re-use a server which we've already + # used. If we don't remember the fact that we sent them one share + # already, we'll mistakenly think we're experiencing an + # UncoordinatedWriteError. + + # Break one server, then create the file: the initial publish should + # complete with an alternate server. Breaking a second server should + # not prevent an update from succeeding either. + basedir = os.path.join("mutable/CollidingWrites/test_bad_server") + self.client = LessFakeClient(basedir, 10) + + peerids = sorted(self.client._connections.keys()) + self.client._connections[peerids[0]].broken = True + + d = self.client.create_mutable_file("contents 1") + def _created(n): + d = n.download_best_version() + d.addCallback(lambda res: self.failUnlessEqual(res, "contents 1")) + # now break one of the remaining servers + def _break_second_server(res): + self.client._connections[peerids[1]].broken = True + d.addCallback(_break_second_server) + d.addCallback(lambda res: n.overwrite("contents 2")) + # that ought to work too + d.addCallback(lambda res: n.download_best_version()) + d.addCallback(lambda res: self.failUnlessEqual(res, "contents 2")) + return d + d.addCallback(_created) + return d + def test_publish_all_servers_bad(self): # Break all servers: the publish should fail basedir = os.path.join("mutable/CollidingWrites/publish_all_servers_bad")