that we want to track and report whether or not each server
responded.)"""
- d = server.callRemote("get_buckets", storageindex)
if self._add_lease:
renew_secret = self._get_renewal_secret(serverid)
cancel_secret = self._get_cancel_secret(serverid)
d2 = server.callRemote("add_lease", storageindex,
renew_secret, cancel_secret)
- dl = defer.DeferredList([d, d2], consumeErrors=True)
- def _done(res):
- [(get_success, get_result),
- (addlease_success, addlease_result)] = res
- # ignore remote IndexError on the add_lease call. Propagate
- # local errors and remote non-IndexErrors
- if addlease_success:
- return get_result
- if not addlease_result.check(RemoteException):
- # Propagate local errors
- return addlease_result
- if addlease_result.value.failure.check(IndexError):
- # tahoe=1.3.0 raised IndexError on non-existant
- # buckets, which we ignore
- return get_result
- # propagate remote errors that aren't IndexError, including
- # the unfortunate internal KeyError bug that <1.3.0 had.
- return addlease_result
- dl.addCallback(_done)
- d = dl
+ d2.addErrback(self._add_lease_failed, serverid, storageindex)
+ d = server.callRemote("get_buckets", storageindex)
def _wrap_results(res):
return (res, serverid, True)
d.addCallbacks(_wrap_results, _trap_errs)
return d
+ def _add_lease_failed(self, f, peerid, storage_index):
+ # Older versions of Tahoe didn't handle the add-lease message very
+ # well: <=1.1.0 throws a NameError because it doesn't implement
+ # remote_add_lease(), 1.2.0/1.3.0 throw IndexError on unknown buckets
+ # (which is most of them, since we send add-lease to everybody,
+ # before we know whether or not they have any shares for us), and
+ # 1.2.0 throws KeyError even on known buckets due to an internal bug
+ # in the latency-measuring code.
+
+ # we want to ignore the known-harmless errors and log the others. In
+ # particular we want to log any local errors caused by coding
+ # problems.
+
+ if f.check(RemoteException):
+ if f.value.failure.check(KeyError, IndexError, NameError):
+ # this may ignore a bit too much, but that only hurts us
+ # during debugging
+ return
+ self.log(format="error in add_lease from [%(peerid)s]: %(f_value)s",
+ peerid=idlib.shortnodeid_b2a(peerid),
+ f_value=str(f.value),
+ failure=f,
+ level=log.WEIRD, umid="atbAxw")
+ return
+ # local errors are cause for alarm
+ log.err(format="local error in add_lease to [%(peerid)s]: %(f_value)s",
+ peerid=idlib.shortnodeid_b2a(peerid),
+ f_value=str(f.value),
+ failure=f,
+ level=log.WEIRD, umid="hEGuQg")
+
+
def _download_and_verify(self, serverid, sharenum, bucket):
"""Start an attempt to download and verify every block in this bucket
and return a deferred that will eventually fire once the attempt
return d
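
The hunk above (apparently the immutable checker's _get_buckets) replaces the DeferredList wiring with a fire-and-forget errback on the add_lease Deferred, so that only the get_buckets result determines what the checker reports. A minimal standalone sketch of that pattern, with the foolscap RemoteException wrapping elided and hypothetical callables standing in for the two callRemote() calls:

    from twisted.internet import defer

    def _ignore_harmless(f):
        # trap the exception types that old storage servers are known to
        # raise from add_lease; anything else is re-raised, and a real
        # caller would log it the way _add_lease_failed does above
        f.trap(KeyError, IndexError, NameError)

    def check_share(add_lease, get_buckets):
        # add_lease and get_buckets are hypothetical callables that return
        # Deferreds, standing in for the callRemote() calls in the patch
        d2 = add_lease()
        d2.addErrback(_ignore_harmless)   # fire-and-forget; result discarded
        return get_buckets()              # only this Deferred decides the verdict

    # even if add_lease fails with one of the known-harmless errors, the
    # caller still sees the get_buckets result
    d = check_share(lambda: defer.fail(IndexError("no such bucket")),
                    lambda: defer.succeed({0: "bucket contents"}))
    d.addCallback(print)                  # prints {0: 'bucket contents'}

Because the add_lease Deferred is never joined back into the main result chain, a failure there can no longer mask a successful bucket enumeration.
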
def _do_read(self, ss, peerid, storage_index, shnums, readv):
- d = ss.callRemote("slot_readv", storage_index, shnums, readv)
if self._add_lease:
+ # send an add-lease message in parallel. The results are handled
+ # separately. This is sent before the slot_readv() so that we can
+ # be sure the add_lease is retired by the time slot_readv comes
+ # back (this relies upon our knowledge that the server code for
+ # add_lease is synchronous).
renew_secret = self._node.get_renewal_secret(peerid)
cancel_secret = self._node.get_cancel_secret(peerid)
d2 = ss.callRemote("add_lease", storage_index,
renew_secret, cancel_secret)
- dl = defer.DeferredList([d, d2], consumeErrors=True)
- def _done(res):
- [(readv_success, readv_result),
- (addlease_success, addlease_result)] = res
- # ignore remote IndexError on the add_lease call. Propagate
- # local errors and remote non-IndexErrors
- if addlease_success:
- return readv_result
- if not addlease_result.check(RemoteException):
- # Propagate local errors
- return addlease_result
- if addlease_result.value.failure.check(IndexError):
- # tahoe=1.3.0 raised IndexError on non-existant
- # buckets, which we ignore
- return readv_result
- # propagate remote errors that aren't IndexError, including
- # the unfortunate internal KeyError bug that <1.3.0 had.
- return addlease_result
- dl.addCallback(_done)
- return dl
+            # the add_lease result is ignored; failures go to the errback below
+ d2.addErrback(self._add_lease_failed, peerid, storage_index)
+ d = ss.callRemote("slot_readv", storage_index, shnums, readv)
return d
def _got_results(self, datavs, peerid, readsize, stuff, started):
self._status.set_privkey_from(peerid)
+ def _add_lease_failed(self, f, peerid, storage_index):
+ # Older versions of Tahoe didn't handle the add-lease message very
+ # well: <=1.1.0 throws a NameError because it doesn't implement
+ # remote_add_lease(), 1.2.0/1.3.0 throw IndexError on unknown buckets
+ # (which is most of them, since we send add-lease to everybody,
+ # before we know whether or not they have any shares for us), and
+ # 1.2.0 throws KeyError even on known buckets due to an internal bug
+ # in the latency-measuring code.
+
+ # we want to ignore the known-harmless errors and log the others. In
+ # particular we want to log any local errors caused by coding
+ # problems.
+
+ if f.check(RemoteException):
+ if f.value.failure.check(KeyError, IndexError, NameError):
+ # this may ignore a bit too much, but that only hurts us
+ # during debugging
+ return
+ self.log(format="error in add_lease from [%(peerid)s]: %(f_value)s",
+ peerid=idlib.shortnodeid_b2a(peerid),
+ f_value=str(f.value),
+ failure=f,
+ level=log.WEIRD, umid="iqg3mw")
+ return
+ # local errors are cause for alarm
+ log.err(format="local error in add_lease to [%(peerid)s]: %(f_value)s",
+ peerid=idlib.shortnodeid_b2a(peerid),
+ f_value=str(f.value),
+ failure=f,
+ level=log.WEIRD, umid="ZWh6HA")
+
def _query_failed(self, f, peerid):
if not self._running:
return
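
For contrast with the hunk above (apparently the mutable servermap's _do_read), here is a compressed, illustrative sketch of why the removed DeferredList code could produce a false negative: whenever add_lease failed with anything other than a remote IndexError, including the KeyError raised by 1.2.0's latency-measuring bug, the old _done() returned the add-lease Failure and the perfectly good slot_readv answer was thrown away. The RemoteException wrapping is again elided.

    from twisted.internet import defer

    def old_style(readv_d, addlease_d):
        # simplified form of the removed code: both Deferreds are joined,
        # and any add_lease failure other than IndexError wins over a good
        # readv result
        dl = defer.DeferredList([readv_d, addlease_d], consumeErrors=True)
        def _done(res):
            (readv_ok, readv_result), (lease_ok, lease_result) = res
            if lease_ok:
                return readv_result
            if lease_result.check(IndexError):
                return readv_result
            return lease_result          # e.g. the 1.2.0 KeyError lands here
        dl.addCallback(_done)
        return dl

    d = old_style(defer.succeed({"sh0": ["good data"]}),
                  defer.fail(KeyError("latency-table bug")))
    d.addCallbacks(lambda r: print("healthy:", r),
                   lambda f: print("reported broken:", f.value))
    # prints: reported broken: 'latency-table bug'

The new code never gives the add-lease call that opportunity: its failures are routed to _add_lease_failed and the slot_readv Deferred is returned on its own. The ordering comment in the hunk relies on foolscap delivering the two callRemote messages in the order they were sent and on the server handling add_lease synchronously, so the lease is in place before the slot_readv answer is generated.
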
from allmydata import check_results, uri
from allmydata.web import check_results as web_check_results
from allmydata.storage_client import StorageFarmBroker, NativeStorageClientDescriptor
+from allmydata.monitor import Monitor
+from allmydata.test.no_network import GridTestMixin
+from allmydata.immutable.upload import Data
from common_web import WebRenderingMixin
class FakeClient:
d.addCallback(_got_lit_results)
return d
+class AddLease(GridTestMixin, unittest.TestCase):
+ # test for #875, in which failures in the add-lease call cause
+ # false-negatives in the checker
+
+ def test_875(self):
+ self.basedir = "checker/AddLease/875"
+ self.set_up_grid(num_servers=1)
+ c0 = self.g.clients[0]
+ self.uris = {}
+ DATA = "data" * 100
+ d = c0.upload(Data(DATA, convergence=""))
+ def _stash_immutable(ur):
+ self.imm = c0.create_node_from_uri(ur.uri)
+ d.addCallback(_stash_immutable)
+ d.addCallback(lambda ign: c0.create_mutable_file("contents"))
+ def _stash_mutable(node):
+ self.mut = node
+ d.addCallback(_stash_mutable)
+
+ def _check_cr(cr, which):
+ self.failUnless(cr.is_healthy(), which)
+
+ # these two should work normally
+ d.addCallback(lambda ign: self.imm.check(Monitor(), add_lease=True))
+ d.addCallback(_check_cr, "immutable-normal")
+ d.addCallback(lambda ign: self.mut.check(Monitor(), add_lease=True))
+ d.addCallback(_check_cr, "mutable-normal")
+
+ really_did_break = []
+ # now break the server's remote_add_lease call
+ def _break_add_lease(ign):
+ def broken_add_lease(*args, **kwargs):
+ really_did_break.append(1)
+ raise KeyError("intentional failure, should be ignored")
+ assert self.g.servers_by_number[0].remote_add_lease
+ self.g.servers_by_number[0].remote_add_lease = broken_add_lease
+ d.addCallback(_break_add_lease)
+
+ # and confirm that the files still look healthy
+ d.addCallback(lambda ign: self.mut.check(Monitor(), add_lease=True))
+ d.addCallback(_check_cr, "mutable-broken")
+ d.addCallback(lambda ign: self.imm.check(Monitor(), add_lease=True))
+ d.addCallback(_check_cr, "immutable-broken")
+
+ d.addCallback(lambda ign: self.failUnless(really_did_break))
+ return d
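
The test exercises the fix by monkey-patching the no_network grid's storage server so that remote_add_lease always raises, then confirming that the immutable and mutable checkers still report the files as healthy. The same injection idea can be packaged as a small helper; this is a hypothetical convenience, not part of the patch:

    def break_remote_method(server, name, exc):
        # replace a method on a test server instance with one that records
        # each call and raises 'exc'; return the call log plus a restorer
        calls = []
        original = getattr(server, name)
        def broken(*args, **kwargs):
            calls.append((args, kwargs))
            raise exc
        setattr(server, name, broken)
        def restore():
            setattr(server, name, original)
        return calls, restore

    # e.g. calls, restore = break_remote_method(
    #          self.g.servers_by_number[0], "remote_add_lease",
    #          KeyError("intentional failure, should be ignored"))

Assuming the class lands in src/allmydata/test/test_checker.py, as the surrounding imports suggest, it can be run on its own with Twisted's trial runner: trial allmydata.test.test_checker.AddLease.test_875.
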