deep-stats: add file-size histogram
author Brian Warner <warner@allmydata.com>
Thu, 8 May 2008 23:19:42 +0000 (16:19 -0700)
committer Brian Warner <warner@allmydata.com>
Thu, 8 May 2008 23:19:42 +0000 (16:19 -0700)
docs/webapi.txt
src/allmydata/dirnode.py
src/allmydata/test/test_dirnode.py
src/allmydata/test/test_system.py
src/allmydata/test/test_web.py

index cebe3e267a1c764fa328d6a29ce5c6a8b5df21c2..5e909792e30886be619e06fdfbfff5e8dea25b24 100644 (file)
@@ -585,6 +585,9 @@ GET $URL?t=deep-stats
    size-mutable-files (TODO): same, for current version of all mutable files
    size-literal-files: same, for LIT files
    size-directories: size of directories (includes size-literal-files)
+   size-files-histogram: list of (minsize, maxsize, count) buckets,
+                         with a histogram of filesizes, 5dB/bucket,
+                         for both literal and immutable files
    largest-directory: number of children in the largest directory
    largest-immutable-file: number of bytes in the largest CHK file
 
index d9db0beb314ab7db07e363b7ade15d9f5189b7cc..27e35c547eea50672254f1957d676a82cdf99e98 100644 (file)
@@ -1,5 +1,5 @@
 
-import os, time
+import os, time, math
 
 from zope.interface import implements
 from twisted.internet import defer
@@ -8,7 +8,7 @@ from allmydata.mutable.common import NotMutableError
 from allmydata.mutable.node import MutableFileNode
 from allmydata.interfaces import IMutableFileNode, IDirectoryNode,\
      IURI, IFileNode, IMutableFileURI, IVerifierURI, IFilesystemNode
-from allmydata.util import hashutil
+from allmydata.util import hashutil, mathutil
 from allmydata.util.hashutil import netstring
 from allmydata.util.limiter import ConcurrencyLimiter
 from allmydata.uri import NewDirectoryURI
@@ -514,6 +514,7 @@ class NewDirectoryNode:
                 elif IFileNode.providedBy(child): # CHK and LIT
                     stats.add("count-files")
                     size = child.get_size()
+                    stats.histogram("size-files-histogram", size)
                     if child.get_uri().startswith("URI:LIT:"):
                         stats.add("count-literal-files")
                         stats.add("size-literal-files", size)
@@ -544,6 +545,11 @@ class DeepStats:
                   #"largest-mutable-file",
                   ]:
             self.stats[k] = 0
+        self.histograms = {}
+        for k in ["size-files-histogram"]:
+            self.histograms[k] = {} # maps (min,max) to count
+        self.buckets = [ (0,0), (1,3)]
+        self.root = math.sqrt(10)
 
     def add(self, key, value=1):
         self.stats[key] += value
@@ -551,8 +557,38 @@ class DeepStats:
     def max(self, key, value):
         self.stats[key] = max(self.stats[key], value)
 
+    def which_bucket(self, size):
+        # return (min,max) such that min <= size <= max
+        # values are from the set (0,0), (1,3), (4,10), (11,31), (32,100),
+        # (101,316), (317, 1000), etc: two per decade
+        assert size >= 0
+        i = 0
+        while True:
+            if i >= len(self.buckets):
+                # extend the list
+                new_lower = self.buckets[i-1][1]+1
+                new_upper = int(mathutil.next_power_of_k(new_lower, self.root))
+                self.buckets.append( (new_lower, new_upper) )
+            maybe = self.buckets[i]
+            if maybe[0] <= size <= maybe[1]:
+                return maybe
+            i += 1
+
+    def histogram(self, key, size):
+        bucket = self.which_bucket(size)
+        h = self.histograms[key]
+        if bucket not in h:
+            h[bucket] = 0
+        h[bucket] += 1
+
     def get_results(self):
-        return self.stats
+        stats = self.stats.copy()
+        for key in self.histograms:
+            h = self.histograms[key]
+            out = [ (bucket[0], bucket[1], h[bucket]) for bucket in h ]
+            out.sort()
+            stats[key] = out
+        return stats
 
 
 # use client.create_dirnode() to make one of these
index c07474f85677e67e7a91cbf0a96a3134583f3f1f..d9088495ef6d61af899abe7200c60783b23302d8 100644 (file)
@@ -166,6 +166,7 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
 
         d = self.client.create_empty_dirnode()
         def _then(n):
+            # /
             self.failUnless(n.is_mutable())
             u = n.get_uri()
             self.failUnless(u)
@@ -186,8 +187,13 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
             assert isinstance(ffu_v, str)
             self.expected_manifest.append(ffu_v)
             d.addCallback(lambda res: n.set_uri(u"child", fake_file_uri))
+            # /
+            # /child = mutable
 
             d.addCallback(lambda res: n.create_empty_directory(u"subdir"))
+            # /
+            # /child = mutable
+            # /subdir = directory
             def _created(subdir):
                 self.failUnless(isinstance(subdir, FakeDirectoryNode))
                 self.subdir = subdir
@@ -230,6 +236,7 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
                                 stats["size-directories"])
                 self.failUnless(stats["largest-directory"] > 500,
                                 stats["largest-directory"])
+                self.failUnlessEqual(stats["size-files-histogram"], [])
             d.addCallback(_check_deepstats)
 
             def _add_subsubdir(res):
@@ -458,6 +465,49 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin):
 
         return d
 
+class DeepStats(unittest.TestCase):
+    def test_stats(self):
+        ds = dirnode.DeepStats()
+        ds.add("count-files")
+        ds.add("size-immutable-files", 123)
+        ds.histogram("size-files-histogram", 123)
+        ds.max("largest-directory", 444)
+
+        s = ds.get_results()
+        self.failUnlessEqual(s["count-files"], 1)
+        self.failUnlessEqual(s["size-immutable-files"], 123)
+        self.failUnlessEqual(s["largest-directory"], 444)
+        self.failUnlessEqual(s["count-literal-files"], 0)
+
+        ds.add("count-files")
+        ds.add("size-immutable-files", 321)
+        ds.histogram("size-files-histogram", 321)
+        ds.max("largest-directory", 2)
+
+        s = ds.get_results()
+        self.failUnlessEqual(s["count-files"], 2)
+        self.failUnlessEqual(s["size-immutable-files"], 444)
+        self.failUnlessEqual(s["largest-directory"], 444)
+        self.failUnlessEqual(s["count-literal-files"], 0)
+        self.failUnlessEqual(s["size-files-histogram"],
+                             [ (101, 316, 1), (317, 1000, 1) ])
+
+        ds = dirnode.DeepStats()
+        for i in range(1, 1100):
+            ds.histogram("size-files-histogram", i)
+        ds.histogram("size-files-histogram", 4*1000*1000*1000*1000) # 4TB
+        s = ds.get_results()
+        self.failUnlessEqual(s["size-files-histogram"],
+                             [ (1, 3, 3),
+                               (4, 10, 7),
+                               (11, 31, 21),
+                               (32, 100, 69),
+                               (101, 316, 216),
+                               (317, 1000, 684),
+                               (1001, 3162, 99),
+                               (3162277660169L, 10000000000000L, 1),
+                               ])
+
 
 netstring = hashutil.netstring
 split_netstring = dirnode.split_netstring
index 3596deaaf25f3a402832fb3f8c17b13bb6198a23..60aa97d617ea0d658498c479f5b70dccd54f0b44 100644 (file)
@@ -1112,6 +1112,31 @@ class SystemTest(testutil.SignalMixin, testutil.PollMixin, testutil.StallMixin,
             # P/s2-rw/mydata992 (same as P/s2-rw/mydata992)
             d1.addCallback(lambda manifest:
                            self.failUnlessEqual(len(manifest), 4))
+            d1.addCallback(lambda res: home.deep_stats())
+            def _check_stats(stats):
+                expected = {"count-immutable-files": 1,
+                            "count-mutable-files": 0,
+                            "count-literal-files": 1,
+                            "count-files": 2,
+                            "count-directories": 3,
+                            "size-immutable-files": 112,
+                            "size-literal-files": 23,
+                            #"size-directories": 616, # varies
+                            #"largest-directory": 616,
+                            "largest-directory-children": 3,
+                            "largest-immutable-file": 112,
+                            }
+                for k,v in expected.iteritems():
+                    self.failUnlessEqual(stats[k], v,
+                                         "stats[%s] was %s, not %s" %
+                                         (k, stats[k], v))
+                self.failUnless(stats["size-directories"] > 1300,
+                                stats["size-directories"])
+                self.failUnless(stats["largest-directory"] > 800,
+                                stats["largest-directory"])
+                self.failUnlessEqual(stats["size-files-histogram"],
+                                     [ (11, 31, 1), (101, 316, 1) ])
+            d1.addCallback(_check_stats)
             return d1
         d.addCallback(_got_home)
         return d
index 8636e56e4201e01ae4e870f8782e60cd45b5a0af..ca20669292359da63807cfde095a784c5b72ddbb 100644 (file)
@@ -788,6 +788,8 @@ class Web(WebMixin, unittest.TestCase):
                 self.failUnlessEqual(stats[k], v,
                                      "stats[%s] was %s, not %s" %
                                      (k, stats[k], v))
+            self.failUnlessEqual(stats["size-files-histogram"],
+                                 [ [11, 31, 3] ])
         d.addCallback(_got)
         return d