From 4bb88fd2ee8023619864eda10a81075db6b0827f Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@lothar.com>
Date: Wed, 10 Sep 2008 01:45:04 -0700
Subject: [PATCH] dirnode: refactor recursive-traversal methods, add stats to
 deep_check() method results and t=deep-check webapi

---
 docs/webapi.txt                      |   4 +
 src/allmydata/checker_results.py     |   7 +
 src/allmydata/dirnode.py             | 272 +++++++++++++--------------
 src/allmydata/interfaces.py          |   8 +
 src/allmydata/test/test_system.py    |   1 +
 src/allmydata/web/checker_results.py |   2 +
 6 files changed, 155 insertions(+), 139 deletions(-)

diff --git a/docs/webapi.txt b/docs/webapi.txt
index 97bb5702..cfc3611d 100644
--- a/docs/webapi.txt
+++ b/docs/webapi.txt
@@ -765,6 +765,8 @@ POST $URL?t=deep-check
                          invoked. The 'check-results' field is the same as
                          that returned by t=check&output=JSON, described
                          above.
+   stats: a dictionary with the same keys as the t=deep-stats command
+          (described below)
 
 POST $URL?t=check&repair=true
 
@@ -840,6 +842,8 @@ POST $URL?t=deep-check&repair=true
                          invoked. The 'check-results' field is the same as
                          that returned by t=check&repair=true&output=JSON,
                          described above.
+   stats: a dictionary with the same keys as the t=deep-stats command
+          (described below)
 
 GET $DIRURL?t=manifest
 
diff --git a/src/allmydata/checker_results.py b/src/allmydata/checker_results.py
index f25a39b1..dd4e50c7 100644
--- a/src/allmydata/checker_results.py
+++ b/src/allmydata/checker_results.py
@@ -91,6 +91,10 @@ class DeepResultsBase:
         self.objects_unhealthy = 0
         self.corrupt_shares = []
         self.all_results = {}
+        self.stats = {}
+
+    def update_stats(self, new_stats):
+        self.stats.update(new_stats)
 
     def get_root_storage_index_string(self):
         return self.root_storage_index_s
@@ -101,6 +105,9 @@ class DeepResultsBase:
     def get_all_results(self):
         return self.all_results
 
+    def get_stats(self):
+        return self.stats
+
 
 class DeepCheckResults(DeepResultsBase):
     implements(IDeepCheckResults)
diff --git a/src/allmydata/dirnode.py b/src/allmydata/dirnode.py
index 14cabdf7..3b3fbaee 100644
--- a/src/allmydata/dirnode.py
+++ b/src/allmydata/dirnode.py
@@ -7,7 +7,7 @@ import simplejson
 from allmydata.mutable.common import NotMutableError
 from allmydata.mutable.node import MutableFileNode
 from allmydata.interfaces import IMutableFileNode, IDirectoryNode,\
-     IURI, IFileNode, IMutableFileURI, IVerifierURI, IFilesystemNode, \
+     IURI, IFileNode, IMutableFileURI, IFilesystemNode, \
      ExistingChildError, ICheckable
 from allmydata.checker_results import DeepCheckResults, \
      DeepCheckAndRepairResults
@@ -446,161 +446,92 @@ class NewDirectoryNode:
         d.addCallback(lambda child: self.delete(current_child_name))
         return d
 
-    def build_manifest(self):
-        """Return a frozenset of verifier-capability strings for all nodes
-        (directories and files) reachable from this one."""
+
+    def deep_traverse(self, walker):
+        """Perform a recursive walk, using this dirnode as a root, notifying
+        the 'walker' instance of everything I encounter.
+
+        I call walker.enter_directory(parent, children) once for each dirnode
+        I visit, immediately after retrieving the list of children. I pass in
+        the parent dirnode and the dict of childname->(childnode,metadata).
+        This function should *not* traverse the children: I will do that.
+        enter_directory() is most useful for the deep-stats number that
+        counts how large a directory is.
+
+        I call walker.add_node(node, path) for each node (both files and
+        directories) I can reach. Most work should be done here.
+
+        I avoid loops by keeping track of verifier-caps and refusing to call
+        each() or traverse a node that I've seen before.
+
+        I return a Deferred that will fire with the value of walker.finish().
+        """
 
         # this is just a tree-walker, except that following each edge
         # requires a Deferred. We use a ConcurrencyLimiter to make sure the
         # fan-out doesn't cause problems.
 
-        manifest = set()
-        manifest.add(self.get_verifier())
-        limiter = ConcurrencyLimiter(10) # allow 10 in parallel
-
-        d = self._build_manifest_from_node(self, manifest, limiter)
-        def _done(res):
-            # LIT nodes have no verifier-capability: their data is stored
-            # inside the URI itself, so there is no need to refresh anything.
-            # They indicate this by returning None from their get_verifier
-            # method. We need to remove any such Nones from our set. We also
-            # want to convert all these caps into strings.
-            return frozenset([IVerifierURI(cap).to_string()
-                              for cap in manifest
-                              if cap is not None])
-        d.addCallback(_done)
+        found = set([self.get_verifier()])
+        limiter = ConcurrencyLimiter(10)
+        d = self._deep_traverse_dirnode(self, [], walker, found, limiter)
+        d.addCallback(lambda ignored: walker.finish())
         return d
 
-    def _build_manifest_from_node(self, node, manifest, limiter):
-        d = limiter.add(node.list)
-        def _got_list(res):
-            dl = []
-            for name, (child, metadata) in res.iteritems():
-                verifier = child.get_verifier()
-                if verifier not in manifest:
-                    manifest.add(verifier)
-                    if IDirectoryNode.providedBy(child):
-                        dl.append(self._build_manifest_from_node(child,
-                                                                 manifest,
-                                                                 limiter))
-            if dl:
-                return defer.DeferredList(dl)
-        d.addCallback(_got_list)
+    def _deep_traverse_dirnode(self, node, path, walker, found, limiter):
+        # process this directory, then walk its children
+        d = limiter.add(walker.add_node, node, path)
+        d.addCallback(lambda ignored: node.list())
+        d.addCallback(self._deep_traverse_dirnode_children, node, path,
+                      walker, found, limiter)
         return d
 
-    def deep_stats(self):
-        stats = DeepStats()
-        # we track verifier caps, to avoid double-counting children for which
-        # we've got both a write-cap and a read-cap
-        found = set()
-        found.add(self.get_verifier())
+    def _deep_traverse_dirnode_children(self, children, parent, path,
+                                        walker, found, limiter):
+        dl = [limiter.add(walker.enter_directory, parent, children)]
+        for name, (child, metadata) in children.iteritems():
+            verifier = child.get_verifier()
+            if verifier in found:
+                continue
+            found.add(verifier)
+            childpath = path + [name]
+            if IDirectoryNode.providedBy(child):
+                dl.append(self._deep_traverse_dirnode(child, childpath,
+                                                      walker, found, limiter))
+            else:
+                dl.append(limiter.add(walker.add_node, child, childpath))
+        return defer.DeferredList(dl, fireOnOneErrback=True)
 
-        limiter = ConcurrencyLimiter(10)
 
-        d = self._add_deepstats_from_node(self, found, stats, limiter)
-        d.addCallback(lambda res: stats.get_results())
-        return d
+    def build_manifest(self):
+        """Return a frozenset of verifier-capability strings for all nodes
+        (directories and files) reachable from this one."""
+        return self.deep_traverse(ManifestWalker())
 
-    def _add_deepstats_from_node(self, node, found, stats, limiter):
-        d = limiter.add(node.list)
-        def _got_list(children):
-            dl = []
-            dirsize_bytes = node.get_size()
-            dirsize_children = len(children)
-            stats.add("count-directories")
-            stats.add("size-directories", dirsize_bytes)
-            stats.max("largest-directory", dirsize_bytes)
-            stats.max("largest-directory-children", dirsize_children)
-            for name, (child, metadata) in children.iteritems():
-                verifier = child.get_verifier()
-                if verifier in found:
-                    continue
-                found.add(verifier)
-                if IDirectoryNode.providedBy(child):
-                    dl.append(self._add_deepstats_from_node(child, found,
-                                                            stats, limiter))
-                elif IMutableFileNode.providedBy(child):
-                    stats.add("count-files")
-                    stats.add("count-mutable-files")
-                    # TODO: update the servermap, compute a size, add it to
-                    # size-mutable-files, max it into "largest-mutable-file"
-                elif IFileNode.providedBy(child): # CHK and LIT
-                    stats.add("count-files")
-                    size = child.get_size()
-                    stats.histogram("size-files-histogram", size)
-                    if child.get_uri().startswith("URI:LIT:"):
-                        stats.add("count-literal-files")
-                        stats.add("size-literal-files", size)
-                    else:
-                        stats.add("count-immutable-files")
-                        stats.add("size-immutable-files", size)
-                        stats.max("largest-immutable-file", size)
-            if dl:
-                return defer.DeferredList(dl)
-        d.addCallback(_got_list)
-        return d
+    def deep_stats(self):
+        # Since deep_traverse tracks verifier caps, we avoid double-counting
+        # children for which we've got both a write-cap and a read-cap
+        return self.deep_traverse(DeepStats())
 
     def deep_check(self, verify=False):
-        return self.deep_check_base(verify, False)
-    def deep_check_and_repair(self, verify=False):
-        return self.deep_check_base(verify, True)
-
-    def deep_check_base(self, verify, repair):
-        # shallow-check each object first, then traverse children
-        root_si = self._node.get_storage_index()
-        self._lp = log.msg(format="deep-check starting (%(si)s),"
-                           " verify=%(verify)s, repair=%(repair)s",
-                           si=base32.b2a(root_si), verify=verify, repair=repair)
-        if repair:
-            results = DeepCheckAndRepairResults(root_si)
-        else:
-            results = DeepCheckResults(root_si)
-        found = set()
-        limiter = ConcurrencyLimiter(10)
+        return self.deep_traverse(DeepChecker(self, verify, repair=False))
 
-        d = self._add_deepcheck_from_node([], self, results, found, limiter,
-                                          verify, repair)
-        def _done(res):
-            log.msg("deep-check done", parent=self._lp)
-            return results
-        d.addCallback(_done)
-        return d
-
-    def _add_deepcheck_from_node(self, path, node, results, found, limiter,
-                                 verify, repair):
-        verifier = node.get_verifier()
-        if verifier in found:
-            # avoid loops
-            return None
-        found.add(verifier)
-
-        if repair:
-            d = limiter.add(node.check_and_repair, verify)
-            d.addCallback(results.add_check_and_repair, path)
-        else:
-            d = limiter.add(node.check, verify)
-            d.addCallback(results.add_check, path)
+    def deep_check_and_repair(self, verify=False):
+        return self.deep_traverse(DeepChecker(self, verify, repair=True))
 
-        # TODO: stats: split the DeepStats.foo calls out of
-        # _add_deepstats_from_node into a separate non-recursing method, call
-        # it from both here and _add_deepstats_from_node.
 
-        if IDirectoryNode.providedBy(node):
-            d.addCallback(lambda res: node.list())
-            def _got_children(children):
-                dl = []
-                for name, (child, metadata) in children.iteritems():
-                    childpath = path + [name]
-                    d2 = self._add_deepcheck_from_node(childpath, child,
-                                                       results,
-                                                       found, limiter,
-                                                       verify, repair)
-                    if d2:
-                        dl.append(d2)
-                if dl:
-                    return defer.DeferredList(dl, fireOnOneErrback=True)
-            d.addCallback(_got_children)
-        return d
+class ManifestWalker:
+    def __init__(self):
+        self.manifest = set()
+    def add_node(self, node, path):
+        v = node.get_verifier()
+        # LIT files have no verify-cap, so don't add them
+        if v:
+            assert not isinstance(v, str), "ICK: %s %s" % (v, node)
+            self.manifest.add(v.to_string())
+    def enter_directory(self, parent, children):
+        pass
+    def finish(self):
+        return frozenset(self.manifest)
 
 
 class DeepStats:
@@ -627,6 +558,33 @@ class DeepStats:
         self.buckets = [ (0,0), (1,3)]
         self.root = math.sqrt(10)
 
+    def add_node(self, node, childpath):
+        if IDirectoryNode.providedBy(node):
+            self.add("count-directories")
+        elif IMutableFileNode.providedBy(node):
+            self.add("count-files")
+            self.add("count-mutable-files")
+            # TODO: update the servermap, compute a size, add it to
+            # size-mutable-files, max it into "largest-mutable-file"
+        elif IFileNode.providedBy(node): # CHK and LIT
+            self.add("count-files")
+            size = node.get_size()
+            self.histogram("size-files-histogram", size)
+            if node.get_uri().startswith("URI:LIT:"):
+                self.add("count-literal-files")
+                self.add("size-literal-files", size)
+            else:
+                self.add("count-immutable-files")
+                self.add("size-immutable-files", size)
+                self.max("largest-immutable-file", size)
+
+    def enter_directory(self, parent, children):
+        dirsize_bytes = parent.get_size()
+        dirsize_children = len(children)
+        self.add("size-directories", dirsize_bytes)
+        self.max("largest-directory", dirsize_bytes)
+        self.max("largest-directory-children", dirsize_children)
+
     def add(self, key, value=1):
         self.stats[key] += value
 
@@ -666,6 +624,42 @@ class DeepStats:
             stats[key] = out
         return stats
 
+    def finish(self):
+        return self.get_results()
+
+
+class DeepChecker:
+    def __init__(self, root, verify, repair):
+        root_si = root.get_storage_index()
+        self._lp = log.msg(format="deep-check starting (%(si)s),"
+                           " verify=%(verify)s, repair=%(repair)s",
+                           si=base32.b2a(root_si), verify=verify, repair=repair)
+        self._verify = verify
+        self._repair = repair
+        if repair:
+            self._results = DeepCheckAndRepairResults(root_si)
+        else:
+            self._results = DeepCheckResults(root_si)
+        self._stats = DeepStats()
+
+    def add_node(self, node, childpath):
+        if self._repair:
+            d = node.check_and_repair(self._verify)
+            d.addCallback(self._results.add_check_and_repair, childpath)
+        else:
+            d = node.check(self._verify)
+            d.addCallback(self._results.add_check, childpath)
+        d.addCallback(lambda ignored: self._stats.add_node(node, childpath))
+        return d
+
+    def enter_directory(self, parent, children):
+        return self._stats.enter_directory(parent, children)
+
+    def finish(self):
+        log.msg("deep-check done", parent=self._lp)
+        self._results.update_stats(self._stats.get_results())
+        return self._results
+
 
 # use client.create_dirnode() to make one of these
 
diff --git a/src/allmydata/interfaces.py b/src/allmydata/interfaces.py
index 69ac4e09..44065ff0 100644
--- a/src/allmydata/interfaces.py
+++ b/src/allmydata/interfaces.py
@@ -1648,6 +1648,10 @@ class IDeepCheckResults(Interface):
         be slash-joined) to an ICheckerResults instance, one for each object
         that was checked."""
 
+    def get_stats():
+        """Return a dictionary with the same keys as
+        IDirectoryNode.deep_stats()."""
+
 class IDeepCheckAndRepairResults(Interface):
     """I contain the results of a deep-check-and-repair operation.
 
@@ -1690,6 +1694,10 @@ class IDeepCheckAndRepairResults(Interface):
                                                repair)
         """
 
+    def get_stats():
+        """Return a dictionary with the same keys as
+        IDirectoryNode.deep_stats()."""
+
     def get_corrupt_shares():
         """Return a set of (serverid, storage_index, sharenum) for all shares
         that were found to be corrupt before any repair was attempted. Both
diff --git a/src/allmydata/test/test_system.py b/src/allmydata/test/test_system.py
index 1260692f..23618697 100644
--- a/src/allmydata/test/test_system.py
+++ b/src/allmydata/test/test_system.py
@@ -2271,6 +2271,7 @@ class DeepCheck(SystemTestMixin, unittest.TestCase):
         self.failUnlessEqual(data["count-corrupt-shares"], 0, where)
         self.failUnlessEqual(data["list-corrupt-shares"], [], where)
         self.failUnlessEqual(data["list-unhealthy-files"], [], where)
+        self.json_check_stats(data["stats"], where)
 
     def json_full_deepcheck_and_repair_is_healthy(self, data, n, where):
         self.failUnlessEqual(data["root-storage-index"],
diff --git a/src/allmydata/web/checker_results.py b/src/allmydata/web/checker_results.py
index a43e075c..efccb82b 100644
--- a/src/allmydata/web/checker_results.py
+++ b/src/allmydata/web/checker_results.py
@@ -196,6 +196,7 @@ class DeepCheckResults(rend.Page, ResultsBase):
                                          for (path_t, r)
                                          in self.r.get_all_results().items()
                                          if not r.is_healthy() ]
+        data["stats"] = self.r.get_stats()
         return simplejson.dumps(data, indent=1)
 
     def render_root_storage_index(self, ctx, data):
@@ -344,6 +345,7 @@ class DeepCheckAndRepairResults(rend.Page, ResultsBase):
                                          for (path_t, r)
                                          in self.r.get_all_results().items()
                                          if not r.get_pre_repair_results().is_healthy() ]
+        data["stats"] = self.r.get_stats()
         return simplejson.dumps(data, indent=1)
 
     def render_root_storage_index(self, ctx, data):
-- 
2.45.2