From 6c00a70dbc06f94acb4910d22e356820c9e5054f Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@allmydata.com>
Date: Thu, 8 May 2008 13:21:14 -0700
Subject: [PATCH] dirnode: add a deep_stats(), like deep-size but with more
 information. webish adds t=deep-stats too.

---
 docs/webapi.txt                    | 32 ++++++++++++
 src/allmydata/dirnode.py           | 82 ++++++++++++++++++++++++++++++
 src/allmydata/interfaces.py        | 38 +++++++++++++-
 src/allmydata/test/test_dirnode.py | 23 +++++++++
 src/allmydata/test/test_web.py     | 23 +++++++++
 src/allmydata/webish.py            | 14 +++++
 6 files changed, 210 insertions(+), 2 deletions(-)

diff --git a/docs/webapi.txt b/docs/webapi.txt
index 78b3861a..cebe3e26 100644
--- a/docs/webapi.txt
+++ b/docs/webapi.txt
@@ -571,6 +571,38 @@ GET $URL?t=deep-size
   expansion or encoding overhead into account. Later versions of the code may
   improve this estimate upwards.
 
+GET $URL?t=deep-stats
+
+  Return a JSON-encoded dictionary that lists interesting statistics about
+  the set of all files and directories reachable from the given directory:
+
+   count-immutable-files: count of how many CHK files are in the set
+   count-mutable-files: same, for mutable files (does not include directories)
+   count-literal-files: same, for LIT files (data contained inside the URI)
+   count-files: sum of the above three
+   count-directories: count of directories
+   size-immutable-files: total bytes for all CHK files in the set, =deep-size
+   size-mutable-files (TODO): same, for current version of all mutable files
+   size-literal-files: same, for LIT files
+   size-directories: size of directories (includes size-literal-files)
+   largest-directory: number of bytes in the largest directory
+   largest-immutable-file: number of bytes in the largest CHK file
+
+  size-mutable-files is not implemented, because it would require extra
+  queries to each mutable file to get their size. This may be implemented in
+  the future.
+
+  Assuming no sharing, the basic space consumed by a single root directory is
+  the sum of size-immutable-files, size-mutable-files, and size-directories.
+  The actual disk space used by the shares is larger, because of the
+  following sources of overhead:
+
+   integrity data
+   expansion due to erasure coding
+   share management data (leases)
+   backend (ext3) minimum block size
+
+
 6. XMLRPC (coming soon)
 
   http://127.0.0.1:8123/xmlrpc
diff --git a/src/allmydata/dirnode.py b/src/allmydata/dirnode.py
index 96eb8916..d1e3f86e 100644
--- a/src/allmydata/dirnode.py
+++ b/src/allmydata/dirnode.py
@@ -113,6 +113,8 @@ class NewDirectoryNode:
 
     def __init__(self, client):
         self._client = client
+        self._most_recent_size = None
+
     def __repr__(self):
         return "<%s %s %s>" % (self.__class__.__name__, self.is_readonly() and "RO" or "RW", hasattr(self, '_uri') and self._uri.abbrev())
     def init_from_uri(self, myuri):
@@ -137,8 +139,18 @@ class NewDirectoryNode:
         self._uri = NewDirectoryURI(IMutableFileURI(self._node.get_uri()))
         return self
 
+    def get_size(self):
+        # return the size of our backing mutable file, in bytes, or None if
+        # we have not fetched it yet.
+        return self._most_recent_size
+
+    def _set_size(self, data):
+        self._most_recent_size = len(data)
+        return data
+
     def _read(self):
         d = self._node.download_best_version()
+        d.addCallback(self._set_size)
         d.addCallback(self._unpack_contents)
         return d
 
@@ -463,6 +475,76 @@ class NewDirectoryNode:
         d.addCallback(_got_list)
         return d
 
+    def deep_stats(self):
+        stats = dict([ (k,0) for k in ["count-immutable-files",
+                                       "count-mutable-files",
+                                       "count-literal-files",
+                                       "count-files",
+                                       "count-directories",
+                                       "size-immutable-files",
+                                       #"size-mutable-files",
+                                       "size-literal-files",
+                                       "size-directories",
+                                       "largest-directory",
+                                       "largest-directory-children",
+                                       "largest-immutable-file",
+                                       #"largest-mutable-file",
+                                       ]])
+        # we track verifier caps, to avoid double-counting children for which
+        # we've got both a write-cap and a read-cap
+        found = set()
+        found.add(self.get_verifier())
+
+        limiter = ConcurrencyLimiter(10)
+
+        d = self._add_deepstats_from_node(self, found, stats, limiter)
+        d.addCallback(lambda res: stats)
+        return d
+
+    def _add_deepstats_from_node(self, node, found, stats, limiter):
+        d = limiter.add(node.list)
+        def _got_list(children):
+            dl = []
+            dirsize_bytes = node.get_size()
+            dirsize_children = len(children)
+            stats["count-directories"] += 1
+            stats["size-directories"] += dirsize_bytes
+            stats["largest-directory"] = max(stats["largest-directory"],
+                                             dirsize_bytes)
+            stats["largest-directory-children"] = max(stats["largest-directory-children"],
+                                                      dirsize_children)
+            for name, (child, metadata) in children.iteritems():
+                verifier = child.get_verifier()
+                if verifier in found:
+                    continue
+                found.add(verifier)
+                if IDirectoryNode.providedBy(child):
+                    dl.append(self._add_deepstats_from_node(child, found,
+                                                            stats, limiter))
+                elif IMutableFileNode.providedBy(child):
+                    stats["count-files"] += 1
+                    stats["count-mutable-files"] += 1
+                    # TODO: update the servermap, compute a size, add it to
+                    # stats["size-mutable-files"], max it into
+                    # stats["largest-mutable-file"]
+                elif IFileNode.providedBy(child): # CHK and LIT
+                    stats["count-files"] += 1
+                    size = child.get_size()
+                    if child.get_uri().startswith("URI:LIT:"):
+                        stats["count-literal-files"] += 1
+                        stats["size-literal-files"] += size
+                    else:
+                        stats["count-immutable-files"] += 1
+                        stats["size-immutable-files"] += size
+                        stats["largest-immutable-file"] = max(
+                            stats["largest-immutable-file"], size)
+            if dl:
+                return defer.DeferredList(dl)
+        d.addCallback(_got_list)
+        return d
+
+
+
 # use client.create_dirnode() to make one of these
 
 
diff --git a/src/allmydata/interfaces.py b/src/allmydata/interfaces.py
index b3dbed39..789ae71a 100644
--- a/src/allmydata/interfaces.py
+++ b/src/allmydata/interfaces.py
@@ -867,8 +867,42 @@ class IDirectoryNode(IMutableFilesystemNode):
         operation finishes. The child name must be a unicode string."""
 
     def build_manifest():
-        """Return a frozenset of verifier-capability strings for all nodes
-        (directories and files) reachable from this one."""
+        """Return a Deferred that fires with a frozenset of
+        verifier-capability strings for all nodes (directories and files)
+        reachable from this one."""
+
+    def deep_stats():
+        """Return a Deferred that fires with a dictionary of statistics
+        computed by examining all nodes (directories and files) reachable
+        from this one, with the following keys::
+
+           count-immutable-files: count of how many CHK files are in the set
+           count-mutable-files: same, for mutable files (does not include
+                                directories)
+           count-literal-files: same, for LIT files
+           count-files: sum of the above three
+
+           count-directories: count of directories
+
+           size-immutable-files: total bytes for all CHK files in the set
+           size-mutable-files (TODO): same, for current version of all mutable
+                                      files, does not include directories
+           size-literal-files: same, for LIT files
+           size-directories: size of mutable files used by directories
+
+           largest-directory: number of bytes in the largest directory
+           largest-directory-children: number of children in the largest
+                                       directory
+           largest-immutable-file: number of bytes in the largest CHK file
+
+        size-mutable-files is not yet implemented, because it would involve
+        even more queries than deep_stats does.
+
+        This operation will visit every directory node underneath this one,
+        and can take a long time to run. On a typical workstation with good
+        bandwidth, this can examine roughly 15 directories per second (and
+        takes several minutes of 100% CPU for ~1700 directories).
+        """
 
 class ICodecEncoder(Interface):
     def set_params(data_size, required_shares, max_shares):
diff --git a/src/allmydata/test/test_dirnode.py b/src/allmydata/test/test_dirnode.py
index 918df62c..83d849d3 100644
--- a/src/allmydata/test/test_dirnode.py
+++ b/src/allmydata/test/test_dirnode.py
@@ -207,6 +207,29 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
                                      sorted(self.expected_manifest))
             d.addCallback(_check_manifest)
 
+            d.addCallback(lambda res: n.deep_stats())
+            def _check_deepstats(stats):
+                self.failUnless(isinstance(stats, dict))
+                expected = {"count-immutable-files": 0,
+                            "count-mutable-files": 1,
+                            "count-literal-files": 0,
+                            "count-files": 1,
+                            "count-directories": 2,
+                            "size-immutable-files": 0,
+                            "size-literal-files": 0,
+                            #"size-directories": 616, # varies
+                            #"largest-directory": 616,
+                            "largest-directory-children": 2,
+                            "largest-immutable-file": 0,
+                            }
+                for k,v in expected.iteritems():
+                    self.failUnlessEqual(stats[k], v,
+                                         "stats[%s] was %s, not %s" %
+                                         (k, stats[k], v))
+                self.failUnless(stats["size-directories"] > 600)
+                self.failUnless(stats["largest-directory"] > 600)
+            d.addCallback(_check_deepstats)
+
             def _add_subsubdir(res):
                 return self.subdir.create_empty_directory(u"subsubdir")
             d.addCallback(_add_subsubdir)
diff --git a/src/allmydata/test/test_web.py b/src/allmydata/test/test_web.py
index 0ac61955..8636e56e 100644
--- a/src/allmydata/test/test_web.py
+++ b/src/allmydata/test/test_web.py
@@ -768,6 +768,29 @@ class Web(WebMixin, unittest.TestCase):
         d.addCallback(_got)
         return d
 
+    def test_GET_DIRURL_deepstats(self):
+        d = self.GET(self.public_url + "/foo?t=deep-stats", followRedirect=True)
+        def _got(stats_json):
+            stats = simplejson.loads(stats_json)
+            expected = {"count-immutable-files": 3,
+                        "count-mutable-files": 0,
+                        "count-literal-files": 0,
+                        "count-files": 3,
+                        "count-directories": 3,
+                        "size-immutable-files": 57,
+                        "size-literal-files": 0,
+                        #"size-directories": 1912, # varies
+                        #"largest-directory": 1590,
+                        "largest-directory-children": 5,
+                        "largest-immutable-file": 19,
+                        }
+            for k,v in expected.iteritems():
+                self.failUnlessEqual(stats[k], v,
+                                     "stats[%s] was %s, not %s" %
+                                     (k, stats[k], v))
+        d.addCallback(_got)
+        return d
+
     def test_GET_DIRURL_uri(self):
         d = self.GET(self.public_url + "/foo?t=uri")
         def _check(res):
diff --git a/src/allmydata/webish.py b/src/allmydata/webish.py
index 1ce5ec29..92ce7ea1 100644
--- a/src/allmydata/webish.py
+++ b/src/allmydata/webish.py
@@ -1244,6 +1244,18 @@ class DeepSize(rend.Page):
         d.addCallback(_measure_size)
         return d
 
+class DeepStats(rend.Page):
+
+    def __init__(self, dirnode, dirpath):
+        self._dirnode = dirnode
+        self._dirpath = dirpath
+
+    def renderHTTP(self, ctx):
+        inevow.IRequest(ctx).setHeader("content-type", "text/plain")
+        d = self._dirnode.deep_stats()
+        d.addCallback(simplejson.dumps, indent=1)
+        return d
+
 class ChildError:
     implements(inevow.IResource)
     def renderHTTP(self, ctx):
@@ -1338,6 +1350,8 @@ class VDrive(rend.Page):
                         return Manifest(node, path), ()
                     elif t == "deep-size":
                         return DeepSize(node, path), ()
+                    elif t == "deep-stats":
+                        return DeepStats(node, path), ()
                     elif t == 'rename-form':
                         return RenameForm(self.name, node, path), ()
                     else:
-- 
2.45.2