server.py: get_latencies now reports percentiles _only_ if there are sufficient...

author wilcoxjg <wilcoxjg@gmail.com>

Fri, 27 May 2011 12:01:35 +0000 (05:01 -0700)

committer wilcoxjg <wilcoxjg@gmail.com>

Fri, 27 May 2011 12:01:35 +0000 (05:01 -0700)
author wilcoxjg <wilcoxjg@gmail.com>
Fri, 27 May 2011 12:01:35 +0000 (05:01 -0700)
committer wilcoxjg <wilcoxjg@gmail.com>
Fri, 27 May 2011 12:01:35 +0000 (05:01 -0700)
diff --git a/NEWS.rst b/NEWS.rst

index 26e3f18e52e2c491fde10dc6665989734a48f1e1..3c26d848bec0346784930c50239adf1b60304b0d 100644 (file)
--- a/NEWS.rst
+++ b/NEWS.rst
@@ -1,7 +1,17 @@
-==================================
+==================================
  User-Visible Changes in Tahoe-LAFS
  ==================================
  
+Release 1.9.0 (2011-??-??)
+--------------------------
+
+
+- Nodes now emit "None" for percentiles with higher implied precision
+  than the number of observations can support. Older stats gatherers
+  will throw an exception if they gather stats from a new storage
+  server and it sends a "None" for a percentile. (`#1392`_)
+
+
  Release 1.8.2 (2011-01-30)
  --------------------------
  
diff --git a/docs/stats.rst b/docs/stats.rst

index 681c7687ed47a29b768464a8a124d7059d236341..1a4699d04845f8b4acb3b51c37418e12a48425f7 100644 (file)
--- a/docs/stats.rst
+++ b/docs/stats.rst
@@ -1,4 +1,4 @@
-================
+================
  Tahoe Statistics
  ================
  
@@ -44,7 +44,7 @@ The currently available stats (as of release 1.6.0 or so) are described here:
      this group counts inbound storage-server operations. They are not provided
      by client-only nodes which have been configured to not run a storage server
      (with [storage]enabled=false in tahoe.cfg)
-                           
+
      allocate, write, close, abort
          these are for immutable file uploads. 'allocate' is incremented when a
          client asks if it can upload a share to the server. 'write' is
@@ -134,6 +134,14 @@ The currently available stats (as of release 1.6.0 or so) are described here:
          999 out of the last 1000 operations were faster than the
          given number, and is the same threshold used by Amazon's
          internal SLA, according to the Dynamo paper).
+        Percentiles are only reported in the case of a sufficient
+        number of observations for unambiguous interpretation. For
+        example, the 99.9th percentile is (at the level of thousandths
+        precision) 9 thousandths greater than the 99th
+        percentile for sample sizes greater than or equal to 1000,
+        thus the 99.9th percentile is only reported for samples of 1000
+        or more observations.
+
  
  **counters.uploader.files_uploaded**
  
@@ -195,7 +203,7 @@ The currently available stats (as of release 1.6.0 or so) are described here:
  
      active_uploads
          how many files are currently being uploaded. 0 when idle.
-    
+
      incoming_count
          how many cache files are present in the incoming/ directory,
          which holds ciphertext files that are still being fetched
diff --git a/src/allmydata/interfaces.py b/src/allmydata/interfaces.py

index 430981a947e06ced5ca4578d83f01fb1800a2649..2a9820b0fde9735b60083ec7fe0490bc984efada 100644 (file)
--- a/src/allmydata/interfaces.py
+++ b/src/allmydata/interfaces.py
@@ -2390,12 +2390,12 @@ class RIStatsProvider(RemoteInterface):
      def get_stats():
          """
          returns a dictionary containing 'counters' and 'stats', each a
-        dictionary with string counter/stat name keys, and numeric values.
+        dictionary with string counter/stat name keys, and numeric or None values.
          counters are monotonically increasing measures of work done, and
          stats are instantaneous measures (potentially time averaged
          internally)
          """
-        return DictOf(str, DictOf(str, ChoiceOf(float, int, long)))
+        return DictOf(str, DictOf(str, ChoiceOf(float, int, long, None)))
  
  class RIStatsGatherer(RemoteInterface):
      __remote_name__ = "RIStatsGatherer.tahoe.allmydata.com"
diff --git a/src/allmydata/storage/server.py b/src/allmydata/storage/server.py

index cb58d082b890747f83874ec69cc8240572f40eea..9d93e9905fade819493610b2dfc807b7b118c3f0 100644 (file)
--- a/src/allmydata/storage/server.py
+++ b/src/allmydata/storage/server.py
@@ -116,12 +116,15 @@ class StorageServer(service.MultiService, Referenceable):
  
      def get_latencies(self):
          """Return a dict, indexed by category, that contains a dict of
-        latency numbers for each category. Each dict will contain the
+        latency numbers for each category. If there are sufficient samples
+        for unambiguous interpretation, each dict will contain the
          following keys: mean, 01_0_percentile, 10_0_percentile,
          50_0_percentile (median), 90_0_percentile, 95_0_percentile,
-        99_0_percentile, 99_9_percentile. If no samples have been collected
-        for the given category, then that category name will not be present
-        in the return value."""
+        99_0_percentile, 99_9_percentile.  If there are insufficient
+        samples for a given percentile to be interpreted unambiguously
+        that percentile will be reported as None. If no samples have been
+        collected for the given category, then that category name will
+        not be present in the return value. """
          # note that Amazon's Dynamo paper says they use 99.9% percentile.
          output = {}
          for category in self.latencies:
@@ -129,16 +132,25 @@ class StorageServer(service.MultiService, Referenceable):
                  continue
              stats = {}
              samples = self.latencies[category][:]
-            samples.sort()
              count = len(samples)
-            stats["mean"] = sum(samples) / count
-            stats["01_0_percentile"] = samples[int(0.01 * count)]
-            stats["10_0_percentile"] = samples[int(0.1 * count)]
-            stats["50_0_percentile"] = samples[int(0.5 * count)]
-            stats["90_0_percentile"] = samples[int(0.9 * count)]
-            stats["95_0_percentile"] = samples[int(0.95 * count)]
-            stats["99_0_percentile"] = samples[int(0.99 * count)]
-            stats["99_9_percentile"] = samples[int(0.999 * count)]
+            stats["samplesize"] = count
+            samples.sort()
+            if count > 1:
+                stats["mean"] = sum(samples) / count
+            else:
+                stats["mean"] = None
+
+            orderstatlist = [(0.01, "01_0_percentile", 100), (0.1, "10_0_percentile", 10),\
+                             (0.50, "50_0_percentile", 10), (0.90, "90_0_percentile", 10),\
+                             (0.95, "95_0_percentile", 20), (0.99, "99_0_percentile", 100),\
+                             (0.999, "99_9_percentile", 1000)]
+
+            for percentile, percentilestring, minnumtoobserve in orderstatlist:
+                if count >= minnumtoobserve:
+                    stats[percentilestring] = samples[int(percentile*count)]
+                else:
+                    stats[percentilestring] = None
+
              output[category] = stats
          return output
  
@@ -551,4 +563,3 @@ class StorageServer(service.MultiService, Referenceable):
                  share_type=share_type, si=si_s, shnum=shnum, reason=reason,
                  level=log.SCARY, umid="SGx2fA")
          return None
-
diff --git a/src/allmydata/test/test_storage.py b/src/allmydata/test/test_storage.py

index d63915842a769a9112b75ee70d241a1fa6e1b8e5..afe5824f44eb772edfac3d36a13833ee3d0b474f 100644 (file)
--- a/src/allmydata/test/test_storage.py
+++ b/src/allmydata/test/test_storage.py
@@ -1311,6 +1311,8 @@ class Stats(unittest.TestCase):
              ss.add_latency("allocate", 1.0 * i)
          for i in range(1000):
              ss.add_latency("renew", 1.0 * i)
+        for i in range(20):
+            ss.add_latency("write", 1.0 * i)
          for i in range(10):
              ss.add_latency("cancel", 2.0 * i)
          ss.add_latency("get", 5.0)
@@ -1318,7 +1320,7 @@ class Stats(unittest.TestCase):
          output = ss.get_latencies()
  
          self.failUnlessEqual(sorted(output.keys()),
-                             sorted(["allocate", "renew", "cancel", "get"]))
+                             sorted(["allocate", "renew", "cancel", "write", "get"]))
          self.failUnlessEqual(len(ss.latencies["allocate"]), 1000)
          self.failUnless(abs(output["allocate"]["mean"] - 9500) < 1, output)
          self.failUnless(abs(output["allocate"]["01_0_percentile"] - 9010) < 1, output)
@@ -1339,25 +1341,35 @@ class Stats(unittest.TestCase):
          self.failUnless(abs(output["renew"]["99_0_percentile"] - 990) < 1, output)
          self.failUnless(abs(output["renew"]["99_9_percentile"] - 999) < 1, output)
  
+        self.failUnlessEqual(len(ss.latencies["write"]), 20)
+        self.failUnless(abs(output["write"]["mean"] - 9) < 1, output)
+        self.failUnless(output["write"]["01_0_percentile"] is None, output)
+        self.failUnless(abs(output["write"]["10_0_percentile"] -  2) < 1, output)
+        self.failUnless(abs(output["write"]["50_0_percentile"] - 10) < 1, output)
+        self.failUnless(abs(output["write"]["90_0_percentile"] - 18) < 1, output)
+        self.failUnless(abs(output["write"]["95_0_percentile"] - 19) < 1, output)
+        self.failUnless(output["write"]["99_0_percentile"] is None, output)
+        self.failUnless(output["write"]["99_9_percentile"] is None, output)
+
          self.failUnlessEqual(len(ss.latencies["cancel"]), 10)
          self.failUnless(abs(output["cancel"]["mean"] - 9) < 1, output)
-        self.failUnless(abs(output["cancel"]["01_0_percentile"] -  0) < 1, output)
+        self.failUnless(output["cancel"]["01_0_percentile"] is None, output)
          self.failUnless(abs(output["cancel"]["10_0_percentile"] -  2) < 1, output)
          self.failUnless(abs(output["cancel"]["50_0_percentile"] - 10) < 1, output)
          self.failUnless(abs(output["cancel"]["90_0_percentile"] - 18) < 1, output)
-        self.failUnless(abs(output["cancel"]["95_0_percentile"] - 18) < 1, output)
-        self.failUnless(abs(output["cancel"]["99_0_percentile"] - 18) < 1, output)
-        self.failUnless(abs(output["cancel"]["99_9_percentile"] - 18) < 1, output)
+        self.failUnless(output["cancel"]["95_0_percentile"] is None, output)
+        self.failUnless(output["cancel"]["99_0_percentile"] is None, output)
+        self.failUnless(output["cancel"]["99_9_percentile"] is None, output)
  
          self.failUnlessEqual(len(ss.latencies["get"]), 1)
-        self.failUnless(abs(output["get"]["mean"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["01_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["10_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["50_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["90_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["95_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["99_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["99_9_percentile"] - 5) < 1, output)
+        self.failUnless(output["get"]["mean"] is None, output)
+        self.failUnless(output["get"]["01_0_percentile"] is None, output)
+        self.failUnless(output["get"]["10_0_percentile"] is None, output)
+        self.failUnless(output["get"]["50_0_percentile"] is None, output)
+        self.failUnless(output["get"]["90_0_percentile"] is None, output)
+        self.failUnless(output["get"]["95_0_percentile"] is None, output)
+        self.failUnless(output["get"]["99_0_percentile"] is None, output)
+        self.failUnless(output["get"]["99_9_percentile"] is None, output)
  
  def remove_tags(s):
      s = re.sub(r'<[^>]*>', ' ', s)
author	wilcoxjg <wilcoxjg@gmail.com>
	Fri, 27 May 2011 12:01:35 +0000 (05:01 -0700)
committer	wilcoxjg <wilcoxjg@gmail.com>
	Fri, 27 May 2011 12:01:35 +0000 (05:01 -0700)
NEWS.rst		patch \| blob \| history
docs/stats.rst		patch \| blob \| history
src/allmydata/interfaces.py		patch \| blob \| history
src/allmydata/storage/server.py		patch \| blob \| history
src/allmydata/test/test_storage.py		patch \| blob \| history