From 67ad0175cd3e48703b81737abdcf531d167e8daa Mon Sep 17 00:00:00 2001 From: wilcoxjg <wilcoxjg@gmail.com> Date: Fri, 27 May 2011 05:01:35 -0700 Subject: [PATCH] server.py: get_latencies now reports percentiles _only_ if there are sufficient observations for the interpretation of the percentile to be unambiguous. interfaces.py: modified the return type of RIStatsProvider.get_stats to allow for None as a return value NEWS.rst, stats.py: documentation of change to get_latencies stats.rst: now documents percentile modification in get_latencies test_storage.py: test_latencies now expects None in output categories that contain too few samples for the associated percentile to be unambiguously reported. fixes #1392 --- NEWS.rst | 12 ++++++++- docs/stats.rst | 14 ++++++++--- src/allmydata/interfaces.py | 4 +-- src/allmydata/storage/server.py | 39 +++++++++++++++++++----------- src/allmydata/test/test_storage.py | 38 +++++++++++++++++++---------- 5 files changed, 74 insertions(+), 33 deletions(-) diff --git a/NEWS.rst b/NEWS.rst index 26e3f18e..3c26d848 100644 --- a/NEWS.rst +++ b/NEWS.rst @@ -1,7 +1,17 @@ -================================== +================================== User-Visible Changes in Tahoe-LAFS ================================== +Release 1.9.0 (2011-??-??) +-------------------------- + + +- Nodes now emit "None" for percentiles with higher implied precision + than the number of observations can support. Older stats gatherers + will throw an exception if they gather stats from a new storage + server and it sends a "None" for a percentile. (`#1392`_) + + Release 1.8.2 (2011-01-30) -------------------------- diff --git a/docs/stats.rst b/docs/stats.rst index 681c7687..1a4699d0 100644 --- a/docs/stats.rst +++ b/docs/stats.rst @@ -1,4 +1,4 @@ -================ +================ Tahoe Statistics ================ @@ -44,7 +44,7 @@ The currently available stats (as of release 1.6.0 or so) are described here: this group counts inbound storage-server operations. They are not provided by client-only nodes which have been configured to not run a storage server (with [storage]enabled=false in tahoe.cfg) - + allocate, write, close, abort these are for immutable file uploads. 'allocate' is incremented when a client asks if it can upload a share to the server. 'write' is @@ -134,6 +134,14 @@ The currently available stats (as of release 1.6.0 or so) are described here: 999 out of the last 1000 operations were faster than the given number, and is the same threshold used by Amazon's internal SLA, according to the Dynamo paper). + Percentiles are only reported in the case of a sufficient + number of observations for unambiguous interpretation. For + example, the 99.9th percentile is (at the level of thousandths + precision) 9 thousandths greater than the 99th + percentile for sample sizes greater than or equal to 1000, + thus the 99.9th percentile is only reported for samples of 1000 + or more observations. + **counters.uploader.files_uploaded** @@ -195,7 +203,7 @@ The currently available stats (as of release 1.6.0 or so) are described here: active_uploads how many files are currently being uploaded. 0 when idle. - + incoming_count how many cache files are present in the incoming/ directory, which holds ciphertext files that are still being fetched diff --git a/src/allmydata/interfaces.py b/src/allmydata/interfaces.py index 430981a9..2a9820b0 100644 --- a/src/allmydata/interfaces.py +++ b/src/allmydata/interfaces.py @@ -2390,12 +2390,12 @@ class RIStatsProvider(RemoteInterface): def get_stats(): """ returns a dictionary containing 'counters' and 'stats', each a - dictionary with string counter/stat name keys, and numeric values. + dictionary with string counter/stat name keys, and numeric or None values. counters are monotonically increasing measures of work done, and stats are instantaneous measures (potentially time averaged internally) """ - return DictOf(str, DictOf(str, ChoiceOf(float, int, long))) + return DictOf(str, DictOf(str, ChoiceOf(float, int, long, None))) class RIStatsGatherer(RemoteInterface): __remote_name__ = "RIStatsGatherer.tahoe.allmydata.com" diff --git a/src/allmydata/storage/server.py b/src/allmydata/storage/server.py index cb58d082..9d93e990 100644 --- a/src/allmydata/storage/server.py +++ b/src/allmydata/storage/server.py @@ -116,12 +116,15 @@ class StorageServer(service.MultiService, Referenceable): def get_latencies(self): """Return a dict, indexed by category, that contains a dict of - latency numbers for each category. Each dict will contain the + latency numbers for each category. If there are sufficient samples + for unambiguous interpretation, each dict will contain the following keys: mean, 01_0_percentile, 10_0_percentile, 50_0_percentile (median), 90_0_percentile, 95_0_percentile, - 99_0_percentile, 99_9_percentile. If no samples have been collected - for the given category, then that category name will not be present - in the return value.""" + 99_0_percentile, 99_9_percentile. If there are insufficient + samples for a given percentile to be interpreted unambiguously + that percentile will be reported as None. If no samples have been + collected for the given category, then that category name will + not be present in the return value. """ # note that Amazon's Dynamo paper says they use 99.9% percentile. output = {} for category in self.latencies: @@ -129,16 +132,25 @@ class StorageServer(service.MultiService, Referenceable): continue stats = {} samples = self.latencies[category][:] - samples.sort() count = len(samples) - stats["mean"] = sum(samples) / count - stats["01_0_percentile"] = samples[int(0.01 * count)] - stats["10_0_percentile"] = samples[int(0.1 * count)] - stats["50_0_percentile"] = samples[int(0.5 * count)] - stats["90_0_percentile"] = samples[int(0.9 * count)] - stats["95_0_percentile"] = samples[int(0.95 * count)] - stats["99_0_percentile"] = samples[int(0.99 * count)] - stats["99_9_percentile"] = samples[int(0.999 * count)] + stats["samplesize"] = count + samples.sort() + if count > 1: + stats["mean"] = sum(samples) / count + else: + stats["mean"] = None + + orderstatlist = [(0.01, "01_0_percentile", 100), (0.1, "10_0_percentile", 10),\ + (0.50, "50_0_percentile", 10), (0.90, "90_0_percentile", 10),\ + (0.95, "95_0_percentile", 20), (0.99, "99_0_percentile", 100),\ + (0.999, "99_9_percentile", 1000)] + + for percentile, percentilestring, minnumtoobserve in orderstatlist: + if count >= minnumtoobserve: + stats[percentilestring] = samples[int(percentile*count)] + else: + stats[percentilestring] = None + output[category] = stats return output @@ -551,4 +563,3 @@ class StorageServer(service.MultiService, Referenceable): share_type=share_type, si=si_s, shnum=shnum, reason=reason, level=log.SCARY, umid="SGx2fA") return None - diff --git a/src/allmydata/test/test_storage.py b/src/allmydata/test/test_storage.py index d6391584..afe5824f 100644 --- a/src/allmydata/test/test_storage.py +++ b/src/allmydata/test/test_storage.py @@ -1311,6 +1311,8 @@ class Stats(unittest.TestCase): ss.add_latency("allocate", 1.0 * i) for i in range(1000): ss.add_latency("renew", 1.0 * i) + for i in range(20): + ss.add_latency("write", 1.0 * i) for i in range(10): ss.add_latency("cancel", 2.0 * i) ss.add_latency("get", 5.0) @@ -1318,7 +1320,7 @@ class Stats(unittest.TestCase): output = ss.get_latencies() self.failUnlessEqual(sorted(output.keys()), - sorted(["allocate", "renew", "cancel", "get"])) + sorted(["allocate", "renew", "cancel", "write", "get"])) self.failUnlessEqual(len(ss.latencies["allocate"]), 1000) self.failUnless(abs(output["allocate"]["mean"] - 9500) < 1, output) self.failUnless(abs(output["allocate"]["01_0_percentile"] - 9010) < 1, output) @@ -1339,25 +1341,35 @@ class Stats(unittest.TestCase): self.failUnless(abs(output["renew"]["99_0_percentile"] - 990) < 1, output) self.failUnless(abs(output["renew"]["99_9_percentile"] - 999) < 1, output) + self.failUnlessEqual(len(ss.latencies["write"]), 20) + self.failUnless(abs(output["write"]["mean"] - 9) < 1, output) + self.failUnless(output["write"]["01_0_percentile"] is None, output) + self.failUnless(abs(output["write"]["10_0_percentile"] - 2) < 1, output) + self.failUnless(abs(output["write"]["50_0_percentile"] - 10) < 1, output) + self.failUnless(abs(output["write"]["90_0_percentile"] - 18) < 1, output) + self.failUnless(abs(output["write"]["95_0_percentile"] - 19) < 1, output) + self.failUnless(output["write"]["99_0_percentile"] is None, output) + self.failUnless(output["write"]["99_9_percentile"] is None, output) + self.failUnlessEqual(len(ss.latencies["cancel"]), 10) self.failUnless(abs(output["cancel"]["mean"] - 9) < 1, output) - self.failUnless(abs(output["cancel"]["01_0_percentile"] - 0) < 1, output) + self.failUnless(output["cancel"]["01_0_percentile"] is None, output) self.failUnless(abs(output["cancel"]["10_0_percentile"] - 2) < 1, output) self.failUnless(abs(output["cancel"]["50_0_percentile"] - 10) < 1, output) self.failUnless(abs(output["cancel"]["90_0_percentile"] - 18) < 1, output) - self.failUnless(abs(output["cancel"]["95_0_percentile"] - 18) < 1, output) - self.failUnless(abs(output["cancel"]["99_0_percentile"] - 18) < 1, output) - self.failUnless(abs(output["cancel"]["99_9_percentile"] - 18) < 1, output) + self.failUnless(output["cancel"]["95_0_percentile"] is None, output) + self.failUnless(output["cancel"]["99_0_percentile"] is None, output) + self.failUnless(output["cancel"]["99_9_percentile"] is None, output) self.failUnlessEqual(len(ss.latencies["get"]), 1) - self.failUnless(abs(output["get"]["mean"] - 5) < 1, output) - self.failUnless(abs(output["get"]["01_0_percentile"] - 5) < 1, output) - self.failUnless(abs(output["get"]["10_0_percentile"] - 5) < 1, output) - self.failUnless(abs(output["get"]["50_0_percentile"] - 5) < 1, output) - self.failUnless(abs(output["get"]["90_0_percentile"] - 5) < 1, output) - self.failUnless(abs(output["get"]["95_0_percentile"] - 5) < 1, output) - self.failUnless(abs(output["get"]["99_0_percentile"] - 5) < 1, output) - self.failUnless(abs(output["get"]["99_9_percentile"] - 5) < 1, output) + self.failUnless(output["get"]["mean"] is None, output) + self.failUnless(output["get"]["01_0_percentile"] is None, output) + self.failUnless(output["get"]["10_0_percentile"] is None, output) + self.failUnless(output["get"]["50_0_percentile"] is None, output) + self.failUnless(output["get"]["90_0_percentile"] is None, output) + self.failUnless(output["get"]["95_0_percentile"] is None, output) + self.failUnless(output["get"]["99_0_percentile"] is None, output) + self.failUnless(output["get"]["99_9_percentile"] is None, output) def remove_tags(s): s = re.sub(r'<[^>]*>', ' ', s) -- 2.45.2