From: Brian Warner Date: Thu, 8 May 2008 23:19:42 +0000 (-0700) Subject: deep-stats: add file-size histogram X-Git-Tag: allmydata-tahoe-1.1.0~142 X-Git-Url: https://git.rkrishnan.org/architecture.txt?a=commitdiff_plain;h=fabdc28c0631c92ee1573a2671f401ba79a5a159;p=tahoe-lafs%2Ftahoe-lafs.git deep-stats: add file-size histogram --- diff --git a/docs/webapi.txt b/docs/webapi.txt index cebe3e26..5e909792 100644 --- a/docs/webapi.txt +++ b/docs/webapi.txt @@ -585,6 +585,9 @@ GET $URL?t=deep-stats size-mutable-files (TODO): same, for current version of all mutable files size-literal-files: same, for LIT files size-directories: size of directories (includes size-literal-files) + size-files-histogram: list of (minsize, maxsize, count) buckets, + with a histogram of filesizes, 5dB/bucket, + for both literal and immutable files largest-directory: number of children in the largest directory largest-immutable-file: number of bytes in the largest CHK file diff --git a/src/allmydata/dirnode.py b/src/allmydata/dirnode.py index d9db0beb..27e35c54 100644 --- a/src/allmydata/dirnode.py +++ b/src/allmydata/dirnode.py @@ -1,5 +1,5 @@ -import os, time +import os, time, math from zope.interface import implements from twisted.internet import defer @@ -8,7 +8,7 @@ from allmydata.mutable.common import NotMutableError from allmydata.mutable.node import MutableFileNode from allmydata.interfaces import IMutableFileNode, IDirectoryNode,\ IURI, IFileNode, IMutableFileURI, IVerifierURI, IFilesystemNode -from allmydata.util import hashutil +from allmydata.util import hashutil, mathutil from allmydata.util.hashutil import netstring from allmydata.util.limiter import ConcurrencyLimiter from allmydata.uri import NewDirectoryURI @@ -514,6 +514,7 @@ class NewDirectoryNode: elif IFileNode.providedBy(child): # CHK and LIT stats.add("count-files") size = child.get_size() + stats.histogram("size-files-histogram", size) if child.get_uri().startswith("URI:LIT:"): stats.add("count-literal-files") stats.add("size-literal-files", size) @@ -544,6 +545,11 @@ class DeepStats: #"largest-mutable-file", ]: self.stats[k] = 0 + self.histograms = {} + for k in ["size-files-histogram"]: + self.histograms[k] = {} # maps (min,max) to count + self.buckets = [ (0,0), (1,3)] + self.root = math.sqrt(10) def add(self, key, value=1): self.stats[key] += value @@ -551,8 +557,38 @@ class DeepStats: def max(self, key, value): self.stats[key] = max(self.stats[key], value) + def which_bucket(self, size): + # return (min,max) such that min <= size <= max + # values are from the set (0,0), (1,3), (4,10), (11,31), (32,100), + # (101,316), (317, 1000), etc: two per decade + assert size >= 0 + i = 0 + while True: + if i >= len(self.buckets): + # extend the list + new_lower = self.buckets[i-1][1]+1 + new_upper = int(mathutil.next_power_of_k(new_lower, self.root)) + self.buckets.append( (new_lower, new_upper) ) + maybe = self.buckets[i] + if maybe[0] <= size <= maybe[1]: + return maybe + i += 1 + + def histogram(self, key, size): + bucket = self.which_bucket(size) + h = self.histograms[key] + if bucket not in h: + h[bucket] = 0 + h[bucket] += 1 + def get_results(self): - return self.stats + stats = self.stats.copy() + for key in self.histograms: + h = self.histograms[key] + out = [ (bucket[0], bucket[1], h[bucket]) for bucket in h ] + out.sort() + stats[key] = out + return stats # use client.create_dirnode() to make one of these diff --git a/src/allmydata/test/test_dirnode.py b/src/allmydata/test/test_dirnode.py index c07474f8..d9088495 100644 --- a/src/allmydata/test/test_dirnode.py +++ b/src/allmydata/test/test_dirnode.py @@ -166,6 +166,7 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin): d = self.client.create_empty_dirnode() def _then(n): + # / self.failUnless(n.is_mutable()) u = n.get_uri() self.failUnless(u) @@ -186,8 +187,13 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin): assert isinstance(ffu_v, str) self.expected_manifest.append(ffu_v) d.addCallback(lambda res: n.set_uri(u"child", fake_file_uri)) + # / + # /child = mutable d.addCallback(lambda res: n.create_empty_directory(u"subdir")) + # / + # /child = mutable + # /subdir = directory def _created(subdir): self.failUnless(isinstance(subdir, FakeDirectoryNode)) self.subdir = subdir @@ -230,6 +236,7 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin): stats["size-directories"]) self.failUnless(stats["largest-directory"] > 500, stats["largest-directory"]) + self.failUnlessEqual(stats["size-files-histogram"], []) d.addCallback(_check_deepstats) def _add_subsubdir(res): @@ -458,6 +465,49 @@ class Dirnode(unittest.TestCase, testutil.ShouldFailMixin, testutil.StallMixin): return d +class DeepStats(unittest.TestCase): + def test_stats(self): + ds = dirnode.DeepStats() + ds.add("count-files") + ds.add("size-immutable-files", 123) + ds.histogram("size-files-histogram", 123) + ds.max("largest-directory", 444) + + s = ds.get_results() + self.failUnlessEqual(s["count-files"], 1) + self.failUnlessEqual(s["size-immutable-files"], 123) + self.failUnlessEqual(s["largest-directory"], 444) + self.failUnlessEqual(s["count-literal-files"], 0) + + ds.add("count-files") + ds.add("size-immutable-files", 321) + ds.histogram("size-files-histogram", 321) + ds.max("largest-directory", 2) + + s = ds.get_results() + self.failUnlessEqual(s["count-files"], 2) + self.failUnlessEqual(s["size-immutable-files"], 444) + self.failUnlessEqual(s["largest-directory"], 444) + self.failUnlessEqual(s["count-literal-files"], 0) + self.failUnlessEqual(s["size-files-histogram"], + [ (101, 316, 1), (317, 1000, 1) ]) + + ds = dirnode.DeepStats() + for i in range(1, 1100): + ds.histogram("size-files-histogram", i) + ds.histogram("size-files-histogram", 4*1000*1000*1000*1000) # 4TB + s = ds.get_results() + self.failUnlessEqual(s["size-files-histogram"], + [ (1, 3, 3), + (4, 10, 7), + (11, 31, 21), + (32, 100, 69), + (101, 316, 216), + (317, 1000, 684), + (1001, 3162, 99), + (3162277660169L, 10000000000000L, 1), + ]) + netstring = hashutil.netstring split_netstring = dirnode.split_netstring diff --git a/src/allmydata/test/test_system.py b/src/allmydata/test/test_system.py index 3596deaa..60aa97d6 100644 --- a/src/allmydata/test/test_system.py +++ b/src/allmydata/test/test_system.py @@ -1112,6 +1112,31 @@ class SystemTest(testutil.SignalMixin, testutil.PollMixin, testutil.StallMixin, # P/s2-rw/mydata992 (same as P/s2-rw/mydata992) d1.addCallback(lambda manifest: self.failUnlessEqual(len(manifest), 4)) + d1.addCallback(lambda res: home.deep_stats()) + def _check_stats(stats): + expected = {"count-immutable-files": 1, + "count-mutable-files": 0, + "count-literal-files": 1, + "count-files": 2, + "count-directories": 3, + "size-immutable-files": 112, + "size-literal-files": 23, + #"size-directories": 616, # varies + #"largest-directory": 616, + "largest-directory-children": 3, + "largest-immutable-file": 112, + } + for k,v in expected.iteritems(): + self.failUnlessEqual(stats[k], v, + "stats[%s] was %s, not %s" % + (k, stats[k], v)) + self.failUnless(stats["size-directories"] > 1300, + stats["size-directories"]) + self.failUnless(stats["largest-directory"] > 800, + stats["largest-directory"]) + self.failUnlessEqual(stats["size-files-histogram"], + [ (11, 31, 1), (101, 316, 1) ]) + d1.addCallback(_check_stats) return d1 d.addCallback(_got_home) return d diff --git a/src/allmydata/test/test_web.py b/src/allmydata/test/test_web.py index 8636e56e..ca206692 100644 --- a/src/allmydata/test/test_web.py +++ b/src/allmydata/test/test_web.py @@ -788,6 +788,8 @@ class Web(WebMixin, unittest.TestCase): self.failUnlessEqual(stats[k], v, "stats[%s] was %s, not %s" % (k, stats[k], v)) + self.failUnlessEqual(stats["size-files-histogram"], + [ [11, 31, 3] ]) d.addCallback(_got) return d