From: Brian Warner Date: Wed, 4 Aug 2010 07:27:41 +0000 (-0700) Subject: ShareFinder: add 10s OVERDUE timer, send new requests to replace overdue ones X-Git-Tag: allmydata-tahoe-1.8.0b2~17 X-Git-Url: https://git.rkrishnan.org/components/com_hotproperty/reliability?a=commitdiff_plain;h=20847dd8768a1622a63abfc73ec0bad0d5e594ff;p=tahoe-lafs%2Ftahoe-lafs.git ShareFinder: add 10s OVERDUE timer, send new requests to replace overdue ones The fixed 10-second timer will eventually be replaced with a per-server value, calculated based on observed response times. test_hung_server.py: enhance to exercise DYHB=OVERDUE state. Split existing mutable+immutable tests into two pieces for clarity. Reenabled several tests. Deleted the now-obsolete "test_failover_during_stage_4". --- diff --git a/src/allmydata/immutable/downloader/finder.py b/src/allmydata/immutable/downloader/finder.py index 1a2a94da..9adee99c 100644 --- a/src/allmydata/immutable/downloader/finder.py +++ b/src/allmydata/immutable/downloader/finder.py @@ -3,6 +3,7 @@ import time now = time.time from foolscap.api import eventually from allmydata.util import base32, log, idlib +from twisted.internet import reactor from share import Share, CommonShare @@ -24,6 +25,8 @@ class RequestToken: self.peerid = peerid class ShareFinder: + OVERDUE_TIMEOUT = 10.0 + def __init__(self, storage_broker, verifycap, node, download_status, logparent=None, max_outstanding_requests=10): self.running = True # stopped by Share.stop, from Terminator @@ -38,6 +41,8 @@ class ShareFinder: self._commonshares = {} # shnum to CommonShare instance self.undelivered_shares = [] self.pending_requests = set() + self.overdue_requests = set() # subset of pending_requests + self.overdue_timers = {} self._storage_index = verifycap.storage_index self._si_prefix = base32.b2a_l(self._storage_index[:8], 60) @@ -64,6 +69,9 @@ class ShareFinder: def stop(self): self.running = False + while self.overdue_timers: + req,t = self.overdue_timers.popitem() + t.cancel() # called by our parent CiphertextDownloader def hungry(self): @@ -100,7 +108,8 @@ class ShareFinder: eventually(self.share_consumer.got_shares, [sh]) return - if len(self.pending_requests) >= self.max_outstanding_requests: + non_overdue = self.pending_requests - self.overdue_requests + if len(non_overdue) >= self.max_outstanding_requests: # cannot send more requests, must wait for some to retire return @@ -138,8 +147,11 @@ class ShareFinder: peerid=idlib.shortnodeid_b2a(peerid), level=log.NOISY, umid="Io7pyg") d_ev = self._download_status.add_dyhb_sent(peerid, now()) + # TODO: get the timer from a Server object, it knows best + self.overdue_timers[req] = reactor.callLater(self.OVERDUE_TIMEOUT, + self.overdue, req) d = rref.callRemote("get_buckets", self._storage_index) - d.addBoth(incidentally, self.pending_requests.discard, req) + d.addBoth(incidentally, self._request_retired, req) d.addCallbacks(self._got_response, self._got_error, callbackArgs=(rref.version, peerid, req, d_ev, lp), errbackArgs=(peerid, req, d_ev, lp)) @@ -147,6 +159,19 @@ class ShareFinder: level=log.WEIRD, parent=lp, umid="rpdV0w") d.addCallback(incidentally, eventually, self.loop) + def _request_retired(self, req): + self.pending_requests.discard(req) + self.overdue_requests.discard(req) + if req in self.overdue_timers: + self.overdue_timers[req].cancel() + del self.overdue_timers[req] + + def overdue(self, req): + del self.overdue_timers[req] + assert req in self.pending_requests # paranoia, should never be false + self.overdue_requests.add(req) + eventually(self.loop) + def _got_response(self, buckets, server_version, peerid, req, d_ev, lp): shnums = sorted([shnum for shnum in buckets]) d_ev.finished(shnums, now()) diff --git a/src/allmydata/test/test_hung_server.py b/src/allmydata/test/test_hung_server.py index cef005ab..8856ce22 100644 --- a/src/allmydata/test/test_hung_server.py +++ b/src/allmydata/test/test_hung_server.py @@ -10,12 +10,14 @@ from allmydata.mutable.common import UnrecoverableFileError from allmydata.storage.common import storage_index_to_dir from allmydata.test.no_network import GridTestMixin from allmydata.test.common import ShouldFailMixin, _corrupt_share_data +from allmydata.util.pollmixin import PollMixin from allmydata.interfaces import NotEnoughSharesError immutable_plaintext = "data" * 10000 mutable_plaintext = "muta" * 10000 -class HungServerDownloadTest(GridTestMixin, ShouldFailMixin, unittest.TestCase): +class HungServerDownloadTest(GridTestMixin, ShouldFailMixin, PollMixin, + unittest.TestCase): # Many of these tests take around 60 seconds on François's ARM buildslave: # http://tahoe-lafs.org/buildbot/builders/FranXois%20lenny-armv5tel # allmydata.test.test_hung_server.HungServerDownloadTest.test_2_good_8_broken_duplicate_share_fail once ERRORed after 197 seconds on Midnight Magic's NetBSD buildslave: @@ -23,7 +25,6 @@ class HungServerDownloadTest(GridTestMixin, ShouldFailMixin, unittest.TestCase): # MM's buildslave varies a lot in how long it takes to run tests. timeout = 240 - skip="not ready" def _break(self, servers): for (id, ss) in servers: @@ -37,6 +38,15 @@ class HungServerDownloadTest(GridTestMixin, ShouldFailMixin, unittest.TestCase): for (id, ss) in servers: self.g.unhang_server(id, **kwargs) + def _hang_shares(self, shnums, **kwargs): + # hang all servers who are holding the given shares + hung_serverids = set() + for (i_shnum, i_serverid, i_sharefile) in self.shares: + if i_shnum in shnums: + if i_serverid not in hung_serverids: + self.g.hang_server(i_serverid, **kwargs) + hung_serverids.add(i_serverid) + def _delete_all_shares_from(self, servers): serverids = [id for (id, ss) in servers] for (i_shnum, i_serverid, i_sharefile) in self.shares: @@ -206,49 +216,114 @@ class HungServerDownloadTest(GridTestMixin, ShouldFailMixin, unittest.TestCase): # The tests below do not currently pass for mutable files. - def test_3_good_7_hung(self): + def test_3_good_7_hung_immutable(self): d = defer.succeed(None) - for mutable in [False]: - d.addCallback(lambda ign: self._set_up(mutable, "test_3_good_7_hung")) - d.addCallback(lambda ign: self._hang(self.servers[3:])) - d.addCallback(lambda ign: self._download_and_check()) + d.addCallback(lambda ign: self._set_up(False, "test_3_good_7_hung")) + d.addCallback(lambda ign: self._hang(self.servers[3:])) + d.addCallback(lambda ign: self._download_and_check()) return d - def test_2_good_8_hung_then_1_recovers(self): + def test_5_overdue_immutable(self): + # restrict the ShareFinder to only allow 5 outstanding requests, and + # arrange for the first 5 servers to hang. Then trigger the OVERDUE + # timers (simulating 10 seconds passed), at which point the + # ShareFinder should send additional queries and finish the download + # quickly. If we didn't have OVERDUE timers, this test would fail by + # timing out. + done = [] + d = self._set_up(False, "test_5_overdue_immutable") + def _reduce_max_outstanding_requests_and_download(ign): + self._hang_shares(range(5)) + n = self.c0.create_node_from_uri(self.uri) + self._sf = n._cnode._node._sharefinder + self._sf.max_outstanding_requests = 5 + self._sf.OVERDUE_TIMEOUT = 1000.0 + d2 = download_to_data(n) + # start download, but don't wait for it to complete yet + def _done(res): + done.append(res) # we will poll for this later + d2.addBoth(_done) + d.addCallback(_reduce_max_outstanding_requests_and_download) + from foolscap.eventual import fireEventually, flushEventualQueue + # wait here a while + d.addCallback(lambda res: fireEventually(res)) + d.addCallback(lambda res: flushEventualQueue()) + d.addCallback(lambda ign: self.failIf(done)) + def _check_waiting(ign): + # all the share requests should now be stuck waiting + self.failUnlessEqual(len(self._sf.pending_requests), 5) + # but none should be marked as OVERDUE until the timers expire + self.failUnlessEqual(len(self._sf.overdue_requests), 0) + d.addCallback(_check_waiting) + def _mark_overdue(ign): + # declare four requests overdue, allowing new requests to take + # their place, and leaving one stuck. The finder will keep + # sending requests until there are 5 non-overdue ones + # outstanding, at which point we'll have 4 OVERDUE, 1 + # stuck-but-not-overdue, and 4 live requests. All 4 live requests + # will retire before the download is complete and the ShareFinder + # is shut off. That will leave 4 OVERDUE and 1 + # stuck-but-not-overdue, for a total of 5 requests in in + # _sf.pending_requests + for t in self._sf.overdue_timers.values()[:4]: + t.reset(-1.0) + # the timers ought to fire before the eventual-send does + return fireEventually() + d.addCallback(_mark_overdue) + def _we_are_done(): + return bool(done) + d.addCallback(lambda ign: self.poll(_we_are_done)) + def _check_done(ign): + self.failUnlessEqual(done, [immutable_plaintext]) + self.failUnlessEqual(len(self._sf.pending_requests), 5) + self.failUnlessEqual(len(self._sf.overdue_requests), 4) + d.addCallback(_check_done) + return d + + def test_3_good_7_hung_mutable(self): + raise unittest.SkipTest("still broken") d = defer.succeed(None) - for mutable in [False]: - d.addCallback(lambda ign: self._set_up(mutable, "test_2_good_8_hung_then_1_recovers")) - d.addCallback(lambda ign: self._hang(self.servers[2:3])) - d.addCallback(lambda ign: self._hang(self.servers[3:])) - d.addCallback(lambda ign: self._unhang(self.servers[2:3])) - d.addCallback(lambda ign: self._download_and_check()) + d.addCallback(lambda ign: self._set_up(True, "test_3_good_7_hung")) + d.addCallback(lambda ign: self._hang(self.servers[3:])) + d.addCallback(lambda ign: self._download_and_check()) return d - def test_2_good_8_hung_then_1_recovers_with_2_shares(self): + def test_2_good_8_hung_then_1_recovers_immutable(self): d = defer.succeed(None) - for mutable in [False]: - d.addCallback(lambda ign: self._set_up(mutable, "test_2_good_8_hung_then_1_recovers_with_2_shares")) - d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[0:1], self.servers[2])) - d.addCallback(lambda ign: self._hang(self.servers[2:3])) - d.addCallback(lambda ign: self._hang(self.servers[3:])) - d.addCallback(lambda ign: self._unhang(self.servers[2:3])) - d.addCallback(lambda ign: self._download_and_check()) + d.addCallback(lambda ign: self._set_up(False, "test_2_good_8_hung_then_1_recovers")) + d.addCallback(lambda ign: self._hang(self.servers[2:3])) + d.addCallback(lambda ign: self._hang(self.servers[3:])) + d.addCallback(lambda ign: self._unhang(self.servers[2:3])) + d.addCallback(lambda ign: self._download_and_check()) + return d + + def test_2_good_8_hung_then_1_recovers_mutable(self): + raise unittest.SkipTest("still broken") + d = defer.succeed(None) + d.addCallback(lambda ign: self._set_up(True, "test_2_good_8_hung_then_1_recovers")) + d.addCallback(lambda ign: self._hang(self.servers[2:3])) + d.addCallback(lambda ign: self._hang(self.servers[3:])) + d.addCallback(lambda ign: self._unhang(self.servers[2:3])) + d.addCallback(lambda ign: self._download_and_check()) return d - def test_failover_during_stage_4(self): - raise unittest.SkipTest("needs rewrite") - # See #287 + def test_2_good_8_hung_then_1_recovers_with_2_shares_immutable(self): d = defer.succeed(None) - for mutable in [False]: - d.addCallback(lambda ign: self._set_up(mutable, "test_failover_during_stage_4")) - d.addCallback(lambda ign: self._corrupt_all_shares_in(self.servers[2:3], _corrupt_share_data)) - d.addCallback(lambda ign: self._set_up(mutable, "test_failover_during_stage_4")) - d.addCallback(lambda ign: self._hang(self.servers[3:])) - d.addCallback(lambda ign: self._start_download()) - def _after_starting_download((doned, started4d)): - started4d.addCallback(lambda ign: self._unhang(self.servers[3:4])) - doned.addCallback(self._check) - return doned - d.addCallback(_after_starting_download) + d.addCallback(lambda ign: self._set_up(False, "test_2_good_8_hung_then_1_recovers_with_2_shares")) + d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[0:1], self.servers[2])) + d.addCallback(lambda ign: self._hang(self.servers[2:3])) + d.addCallback(lambda ign: self._hang(self.servers[3:])) + d.addCallback(lambda ign: self._unhang(self.servers[2:3])) + d.addCallback(lambda ign: self._download_and_check()) + return d + def test_2_good_8_hung_then_1_recovers_with_2_shares_mutable(self): + raise unittest.SkipTest("still broken") + d = defer.succeed(None) + d.addCallback(lambda ign: self._set_up(True, "test_2_good_8_hung_then_1_recovers_with_2_shares")) + d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[0:1], self.servers[2])) + d.addCallback(lambda ign: self._hang(self.servers[2:3])) + d.addCallback(lambda ign: self._hang(self.servers[3:])) + d.addCallback(lambda ign: self._unhang(self.servers[2:3])) + d.addCallback(lambda ign: self._download_and_check()) return d