From: Brian Warner Date: Tue, 26 Aug 2008 00:51:55 +0000 (-0700) Subject: logging cleanups: lower DeadReferenceError from WEIRD (which provokes Incidents)... X-Git-Url: https://git.rkrishnan.org/%5B/%5D%20/reliability?a=commitdiff_plain;h=735aa895b91b44ae450768ec7d17b18a8628e57a;p=tahoe-lafs%2Ftahoe-lafs.git logging cleanups: lower DeadReferenceError from WEIRD (which provokes Incidents) to merely UNUSUAL, don't pre-format Failures in others --- diff --git a/src/allmydata/immutable/download.py b/src/allmydata/immutable/download.py index 878d38d0..c719739b 100644 --- a/src/allmydata/immutable/download.py +++ b/src/allmydata/immutable/download.py @@ -4,6 +4,7 @@ from zope.interface import implements from twisted.internet import defer from twisted.internet.interfaces import IPushProducer, IConsumer from twisted.application import service +from foolscap import DeadReferenceError from foolscap.eventual import eventually from allmydata.util import base32, mathutil, hashutil, log @@ -264,10 +265,10 @@ class BlockDownloader: self.results = results self._log_number = self.parent.log("starting block %d" % blocknum) - def log(self, msg, parent=None): - if parent is None: - parent = self._log_number - return self.parent.log(msg, parent=parent) + def log(self, *args, **kwargs): + if "parent" not in kwargs: + kwargs["parent"] = self._log_number + return self.parent.log(*args, **kwargs) def start(self, segnum): lognum = self.log("get_block(segnum=%d)" % segnum) @@ -288,8 +289,11 @@ class BlockDownloader: self.parent.hold_block(self.blocknum, data) def _got_block_error(self, f, lognum): - self.log("BlockDownloader[%d] got error: %s" % (self.blocknum, f), - parent=lognum) + level = log.WEIRD + if f.check(DeadReferenceError): + level = log.UNUSUAL + self.log("BlockDownloader[%d] got error" % self.blocknum, + failure=f, level=level, parent=lognum) if self.results: peerid = self.vbucket.bucket.get_peerid() self.results.server_problems[peerid] = str(f) @@ -311,10 +315,10 @@ class SegmentDownloader: self._log_number = self.parent.log("starting segment %d" % segmentnumber) - def log(self, msg, parent=None): - if parent is None: - parent = self._log_number - return self.parent.log(msg, parent=parent) + def log(self, *args, **kwargs): + if "parent" not in kwargs: + kwargs["parent"] = self._log_number + return self.parent.log(*args, **kwargs) def start(self): return self._download() @@ -588,7 +592,10 @@ class FileDownloader: self._share_buckets.append( (sharenum, bucket) ) def _got_error(self, f): - self._client.log("Somebody failed. -- %s" % (f,)) + level = log.WEIRD + if f.check(DeadReferenceError): + level = log.UNUSUAL + self._client.log("Error during get_buckets", failure=f, level=level) def bucket_failed(self, vbucket): shnum = vbucket.sharenum @@ -656,8 +663,12 @@ class FileDownloader: d.addCallback(lambda res: getattr(bucket, methname)(*args)) d.addCallback(validatorfunc, bucket) def _bad(f): - self.log("%s from vbucket %s failed:" % (name, bucket), - failure=f, level=log.WEIRD) + level = log.WEIRD + if f.check(DeadReferenceError): + level = log.UNUSUAL + self.log(format="operation %(op)s from vbucket %(vbucket)s failed", + op=name, vbucket=str(bucket), + failure=f, level=level) if not sources: raise NotEnoughSharesError("ran out of peers, last error was %s" % (f,)) diff --git a/src/allmydata/mutable/publish.py b/src/allmydata/mutable/publish.py index 15df0919..22a496ad 100644 --- a/src/allmydata/mutable/publish.py +++ b/src/allmydata/mutable/publish.py @@ -117,6 +117,8 @@ class Publish: def log(self, *args, **kwargs): if 'parent' not in kwargs: kwargs['parent'] = self._log_number + if "facility" not in kwargs: + kwargs["facility"] = "tahoe.mutable.publish" return log.msg(*args, **kwargs) def publish(self, newdata): @@ -239,7 +241,7 @@ class Publish: self.bad_share_checkstrings[ (peerid, shnum) ] = old_checkstring self.connections[peerid] = self._servermap.connections[peerid] - # create the shares. We'll discard these as they are delivered. SMDF: + # create the shares. We'll discard these as they are delivered. SDMF: # we're allowed to hold everything in memory. self._status.timings["setup"] = time.time() - self._started @@ -268,7 +270,7 @@ class Publish: assert self.num_segments in [0, 1,] # SDMF restrictions def _fatal_error(self, f): - self.log("error during loop", failure=f, level=log.SCARY) + self.log("error during loop", failure=f, level=log.UNUSUAL) self._done(f) def _update_status(self): diff --git a/src/allmydata/mutable/retrieve.py b/src/allmydata/mutable/retrieve.py index c93d1d28..9000f421 100644 --- a/src/allmydata/mutable/retrieve.py +++ b/src/allmydata/mutable/retrieve.py @@ -4,6 +4,7 @@ from itertools import count from zope.interface import implements from twisted.internet import defer from twisted.python import failure +from foolscap import DeadReferenceError from foolscap.eventual import eventually, fireEventually from allmydata.interfaces import IRetrieveStatus from allmydata.util import hashutil, idlib, log @@ -113,6 +114,8 @@ class Retrieve: def log(self, *args, **kwargs): if "parent" not in kwargs: kwargs["parent"] = self._log_number + if "facility" not in kwargs: + kwargs["facility"] = "tahoe.mutable.retrieve" return log.msg(*args, **kwargs) def download(self): @@ -247,7 +250,8 @@ class Retrieve: except CorruptShareError, e: # log it and give the other shares a chance to be processed f = failure.Failure() - self.log("bad share: %s %s" % (f, f.value), level=log.WEIRD) + self.log(format="bad share: %(f_value)s", + f_value=str(f.value), failure=f, level=log.WEIRD) self.remove_peer(peerid) self.servermap.mark_bad_share(peerid, shnum, prefix) self._bad_shares.add( (peerid, shnum) ) @@ -301,7 +305,11 @@ class Retrieve: return self._last_failure = f self.remove_peer(peerid) - self.log("error during query: %s %s" % (f, f.value), level=log.WEIRD) + level = log.WEIRD + if f.check(DeadReferenceError): + level = log.UNUSUAL + self.log(format="error during query: %(f_value)s", + f_value=str(f.value), failure=f, level=level) def _check_for_done(self, res): # exit paths: diff --git a/src/allmydata/mutable/servermap.py b/src/allmydata/mutable/servermap.py index 2e2e12a9..69b75b3e 100644 --- a/src/allmydata/mutable/servermap.py +++ b/src/allmydata/mutable/servermap.py @@ -360,6 +360,8 @@ class ServermapUpdater: def log(self, *args, **kwargs): if "parent" not in kwargs: kwargs["parent"] = self._log_number + if "facility" not in kwargs: + kwargs["facility"] = "tahoe.mutable.mapupdate" return log.msg(*args, **kwargs) def update(self): @@ -537,8 +539,8 @@ class ServermapUpdater: except CorruptShareError, e: # log it and give the other shares a chance to be processed f = failure.Failure() - self.log("bad share: %s %s" % (f, f.value), - parent=lp, level=log.WEIRD) + self.log(format="bad share: %(f_value)s", f_value=str(f.value), + failure=f, parent=lp, level=log.WEIRD) self._bad_peers.add(peerid) self._last_failure = f checkstring = data[:SIGNED_PREFIX_LENGTH] @@ -689,7 +691,8 @@ class ServermapUpdater: def _query_failed(self, f, peerid): - self.log("error during query: %s %s" % (f, f.value), level=log.WEIRD) + self.log(format="error during query: %(f_value)s", + f_value=str(f.value), failure=f, level=log.WEIRD) if not self._running: return self._must_query.discard(peerid) @@ -716,7 +719,8 @@ class ServermapUpdater: def _privkey_query_failed(self, f, peerid, shnum, lp): self._queries_outstanding.discard(peerid) - self.log("error during privkey query: %s %s" % (f, f.value), + self.log(format="error during privkey query: %(f_value)s", + f_value=str(f.value), failure=f, parent=lp, level=log.WEIRD) if not self._running: return