From: Brian Warner <warner@allmydata.com>
Date: Tue, 15 Apr 2008 22:58:02 +0000 (-0700)
Subject: mutable WIP: if corrupt shares cause a retrieve to fail, restart it once, ignoring... 
X-Git-Tag: allmydata-tahoe-1.1.0~243
X-Git-Url: https://git.rkrishnan.org/pf//%22%3C?a=commitdiff_plain;h=ca14b9939710d9e0e7a5bffa02b82401f3c11a6a;p=tahoe-lafs%2Ftahoe-lafs.git

mutable WIP: if corrupt shares cause a retrieve to fail, restart it once, ignoring those shares and using different ones
---

diff --git a/src/allmydata/encode.py b/src/allmydata/encode.py
index e8d1a1bf..3480ce5f 100644
--- a/src/allmydata/encode.py
+++ b/src/allmydata/encode.py
@@ -61,6 +61,8 @@ hash tree is put into the URI.
 """
 
 class NotEnoughPeersError(Exception):
+    worth_retrying = False
+    servermap = None
     pass
 
 class UploadAborted(Exception):
diff --git a/src/allmydata/mutable/node.py b/src/allmydata/mutable/node.py
index 87295f2c..3ad40470 100644
--- a/src/allmydata/mutable/node.py
+++ b/src/allmydata/mutable/node.py
@@ -7,6 +7,7 @@ from twisted.internet import defer
 from allmydata.interfaces import IMutableFileNode, IMutableFileURI
 from allmydata.util import hashutil
 from allmydata.uri import WriteableSSKFileURI
+from allmydata.encode import NotEnoughPeersError
 from pycryptopp.publickey import rsa
 from pycryptopp.cipher.aes import AES
 
@@ -279,15 +280,30 @@ class MutableFileNode:
         d.addCallback(_done)
         return d
 
-    def download_to_data(self):
-        d = self.obtain_lock()
-        d.addCallback(lambda res: self.update_servermap(mode=MODE_ENOUGH))
+    def _update_and_retrieve_best(self, old_map=None):
+        d = self.update_servermap(old_map=old_map, mode=MODE_ENOUGH)
         def _updated(smap):
             goal = smap.best_recoverable_version()
             if not goal:
                 raise UnrecoverableFileError("no recoverable versions")
             return self.download_version(smap, goal)
         d.addCallback(_updated)
+        return d
+
+    def download_to_data(self):
+        d = self.obtain_lock()
+        d.addCallback(lambda res: self._update_and_retrieve_best())
+        def _maybe_retry(f):
+            f.trap(NotEnoughPeersError)
+            e = f.value
+            if not e.worth_retrying:
+                return f
+            # the download is worth retrying once. Make sure to use the old
+            # servermap, since it is what remembers the bad shares. TODO:
+            # consider allowing this to retry multiple times.. this approach
+            # will let us tolerate about 8 bad shares, I think.
+            return self._update_and_retrieve_best(e.servermap)
+        d.addErrback(_maybe_retry)
         d.addBoth(self.release_lock)
         return d
 
diff --git a/src/allmydata/mutable/retrieve.py b/src/allmydata/mutable/retrieve.py
index 4942d96b..9abc5430 100644
--- a/src/allmydata/mutable/retrieve.py
+++ b/src/allmydata/mutable/retrieve.py
@@ -93,6 +93,7 @@ class Retrieve:
         self._outstanding_queries = {} # maps (peerid,shnum) to start_time
         self._running = True
         self._decoding = False
+        self._bad_shares = set()
 
         self.servermap = servermap
         assert self._node._pubkey
@@ -238,6 +239,8 @@ class Retrieve:
                 f = failure.Failure()
                 self.log("bad share: %s %s" % (f, f.value), level=log.WEIRD)
                 self.remove_peer(peerid)
+                self.servermap.mark_bad_share(peerid, shnum)
+                self._bad_shares.add( (peerid, shnum) )
                 self._last_failure = f
                 pass
         # all done!
@@ -374,19 +377,26 @@ class Retrieve:
             format = ("ran out of peers: "
                       "have %(have)d shares (k=%(k)d), "
                       "%(outstanding)d queries in flight, "
-                      "need %(need)d more")
+                      "need %(need)d more, "
+                      "found %(bad)d bad shares")
+            args = {"have": len(self.shares),
+                    "k": k,
+                    "outstanding": len(self._outstanding_queries),
+                    "need": needed,
+                    "bad": len(self._bad_shares),
+                    }
             self.log(format=format,
-                     have=len(self.shares), k=k,
-                     outstanding=len(self._outstanding_queries),
-                     need=needed,
-                     level=log.WEIRD)
-            msg2 = format % {"have": len(self.shares),
-                             "k": k,
-                             "outstanding": len(self._outstanding_queries),
-                             "need": needed,
-                             }
-            raise NotEnoughPeersError("%s, last failure: %s" %
-                                      (msg2, self._last_failure))
+                     level=log.WEIRD, **args)
+            err = NotEnoughPeersError("%s, last failure: %s" %
+                                      (format % args, self._last_failure))
+            if self._bad_shares:
+                self.log("We found some bad shares this pass. You should "
+                         "update the servermap and try again to check "
+                         "more peers",
+                         level=log.WEIRD)
+                err.worth_retrying = True
+                err.servermap = self.servermap
+            raise err
 
         return
 
diff --git a/src/allmydata/mutable/servermap.py b/src/allmydata/mutable/servermap.py
index 8863ea6e..4f8fca2b 100644
--- a/src/allmydata/mutable/servermap.py
+++ b/src/allmydata/mutable/servermap.py
@@ -25,6 +25,14 @@ class ServerMap:
     operations, which means 'publish this new version, but only if nothing
     has changed since I last retrieved this data'. This reduces the chances
     of clobbering a simultaneous (uncoordinated) write.
+
+    @ivar bad_shares: a sequence of (peerid, shnum) tuples, describing
+                      shares that I should ignore (because a previous user of
+                      the servermap determined that they were invalid). The
+                      updater only locates a certain number of shares: if
+                      some of these turn out to have integrity problems and
+                      are unusable, the caller will need to mark those shares
+                      as bad, then re-update the servermap, then try again.
     """
 
     def __init__(self):
@@ -35,9 +43,36 @@ class ServerMap:
         self.connections = {} # maps peerid to a RemoteReference
         self.unreachable_peers = set() # peerids that didn't respond to queries
         self.problems = [] # mostly for debugging
+        self.bad_shares = set()
         self.last_update_mode = None
         self.last_update_time = 0
 
+    def mark_bad_share(self, peerid, shnum):
+        """This share was found to be bad, not in the checkstring or
+        signature, but deeper in the share, detected at retrieve time. Remove
+        it from our list of useful shares, and remember that it is bad so we
+        don't add it back again later.
+        """
+        self.bad_shares.add( (peerid, shnum) )
+        self._remove_share(peerid, shnum)
+
+    def _remove_share(self, peerid, shnum):
+        #(s_shnum, s_verinfo, s_timestamp) = share
+        to_remove = [share
+                     for share in self.servermap[peerid]
+                     if share[0] == shnum]
+        for share in to_remove:
+            self.servermap[peerid].discard(share)
+        if not self.servermap[peerid]:
+            del self.servermap[peerid]
+
+    def add_new_share(self, peerid, shnum, verinfo, timestamp):
+        """We've written a new share out, replacing any that was there
+        before."""
+        self.bad_shares.discard( (peerid, shnum) )
+        self._remove_share(peerid, shnum)
+        self.servermap.add(peerid, (shnum, verinfo, timestamp) )
+
     def dump(self, out=sys.stdout):
         print >>out, "servermap:"
         for (peerid, shares) in self.servermap.items():
@@ -148,6 +183,11 @@ class ServerMap:
 
 class ServermapUpdater:
     def __init__(self, filenode, servermap, mode=MODE_ENOUGH):
+        """I update a servermap, locating a sufficient number of useful
+        shares and remembering where they are located.
+
+        """
+
         self._node = filenode
         self._servermap = servermap
         self.mode = mode
@@ -415,6 +455,11 @@ class ServermapUpdater:
             self._valid_versions.add(verinfo)
         # We now know that this is a valid candidate verinfo.
 
+        if (peerid, shnum, verinfo) in self._servermap.bad_shares:
+            # we've been told that the rest of the data in this share is
+            # unusable, so don't add it to the servermap.
+            return verinfo
+
         # Add the info to our servermap.
         timestamp = time.time()
         self._servermap.servermap.add(peerid, (shnum, verinfo, timestamp))