]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/commitdiff
server.py: get_latencies now reports percentiles _only_ if there are sufficient samples for unambiguous interpretation
authorwilcoxjg <wilcoxjg@gmail.com>
Fri, 27 May 2011 12:01:35 +0000 (05:01 -0700)
committerwilcoxjg <wilcoxjg@gmail.com>
Fri, 27 May 2011 12:01:35 +0000 (05:01 -0700)
interfaces.py: modified the return type of RIStatsProvider.get_stats to allow for None as a return value
NEWS.rst, stats.py: documentation of change to get_latencies
stats.rst: now documents percentile modification in get_latencies
test_storage.py: test_latencies now expects None in output categories that contain too few samples for the associated percentile to be unambiguously reported.
fixes #1392

NEWS.rst
docs/stats.rst
src/allmydata/interfaces.py
src/allmydata/storage/server.py
src/allmydata/test/test_storage.py

index 26e3f18e52e2c491fde10dc6665989734a48f1e1..3c26d848bec0346784930c50239adf1b60304b0d 100644 (file)
--- a/NEWS.rst
+++ b/NEWS.rst
@@ -1,7 +1,17 @@
-==================================
+==================================
 User-Visible Changes in Tahoe-LAFS
 ==================================
 
 User-Visible Changes in Tahoe-LAFS
 ==================================
 
+Release 1.9.0 (2011-??-??)
+--------------------------
+
+
+- Nodes now emit "None" for percentiles with higher implied precision
+  than the number of observations can support. Older stats gatherers
+  will throw an exception if they gather stats from a new storage
+  server and it sends a "None" for a percentile. (`#1392`_)
+
+
 Release 1.8.2 (2011-01-30)
 --------------------------
 
 Release 1.8.2 (2011-01-30)
 --------------------------
 
index 681c7687ed47a29b768464a8a124d7059d236341..1a4699d04845f8b4acb3b51c37418e12a48425f7 100644 (file)
@@ -1,4 +1,4 @@
-================
+================
 Tahoe Statistics
 ================
 
 Tahoe Statistics
 ================
 
@@ -44,7 +44,7 @@ The currently available stats (as of release 1.6.0 or so) are described here:
     this group counts inbound storage-server operations. They are not provided
     by client-only nodes which have been configured to not run a storage server
     (with [storage]enabled=false in tahoe.cfg)
     this group counts inbound storage-server operations. They are not provided
     by client-only nodes which have been configured to not run a storage server
     (with [storage]enabled=false in tahoe.cfg)
-                           
+
     allocate, write, close, abort
         these are for immutable file uploads. 'allocate' is incremented when a
         client asks if it can upload a share to the server. 'write' is
     allocate, write, close, abort
         these are for immutable file uploads. 'allocate' is incremented when a
         client asks if it can upload a share to the server. 'write' is
@@ -134,6 +134,14 @@ The currently available stats (as of release 1.6.0 or so) are described here:
         999 out of the last 1000 operations were faster than the
         given number, and is the same threshold used by Amazon's
         internal SLA, according to the Dynamo paper).
         999 out of the last 1000 operations were faster than the
         given number, and is the same threshold used by Amazon's
         internal SLA, according to the Dynamo paper).
+        Percentiles are only reported in the case of a sufficient
+        number of observations for unambiguous interpretation. For
+        example, the 99.9th percentile is (at the level of thousandths
+        precision) 9 thousandths greater than the 99th
+        percentile for sample sizes greater than or equal to 1000,
+        thus the 99.9th percentile is only reported for samples of 1000
+        or more observations.
+
 
 **counters.uploader.files_uploaded**
 
 
 **counters.uploader.files_uploaded**
 
@@ -195,7 +203,7 @@ The currently available stats (as of release 1.6.0 or so) are described here:
 
     active_uploads
         how many files are currently being uploaded. 0 when idle.
 
     active_uploads
         how many files are currently being uploaded. 0 when idle.
-    
+
     incoming_count
         how many cache files are present in the incoming/ directory,
         which holds ciphertext files that are still being fetched
     incoming_count
         how many cache files are present in the incoming/ directory,
         which holds ciphertext files that are still being fetched
index 430981a947e06ced5ca4578d83f01fb1800a2649..2a9820b0fde9735b60083ec7fe0490bc984efada 100644 (file)
@@ -2390,12 +2390,12 @@ class RIStatsProvider(RemoteInterface):
     def get_stats():
         """
         returns a dictionary containing 'counters' and 'stats', each a
     def get_stats():
         """
         returns a dictionary containing 'counters' and 'stats', each a
-        dictionary with string counter/stat name keys, and numeric values.
+        dictionary with string counter/stat name keys, and numeric or None values.
         counters are monotonically increasing measures of work done, and
         stats are instantaneous measures (potentially time averaged
         internally)
         """
         counters are monotonically increasing measures of work done, and
         stats are instantaneous measures (potentially time averaged
         internally)
         """
-        return DictOf(str, DictOf(str, ChoiceOf(float, int, long)))
+        return DictOf(str, DictOf(str, ChoiceOf(float, int, long, None)))
 
 class RIStatsGatherer(RemoteInterface):
     __remote_name__ = "RIStatsGatherer.tahoe.allmydata.com"
 
 class RIStatsGatherer(RemoteInterface):
     __remote_name__ = "RIStatsGatherer.tahoe.allmydata.com"
index cb58d082b890747f83874ec69cc8240572f40eea..9d93e9905fade819493610b2dfc807b7b118c3f0 100644 (file)
@@ -116,12 +116,15 @@ class StorageServer(service.MultiService, Referenceable):
 
     def get_latencies(self):
         """Return a dict, indexed by category, that contains a dict of
 
     def get_latencies(self):
         """Return a dict, indexed by category, that contains a dict of
-        latency numbers for each category. Each dict will contain the
+        latency numbers for each category. If there are sufficient samples
+        for unambiguous interpretation, each dict will contain the
         following keys: mean, 01_0_percentile, 10_0_percentile,
         50_0_percentile (median), 90_0_percentile, 95_0_percentile,
         following keys: mean, 01_0_percentile, 10_0_percentile,
         50_0_percentile (median), 90_0_percentile, 95_0_percentile,
-        99_0_percentile, 99_9_percentile. If no samples have been collected
-        for the given category, then that category name will not be present
-        in the return value."""
+        99_0_percentile, 99_9_percentile.  If there are insufficient
+        samples for a given percentile to be interpreted unambiguously
+        that percentile will be reported as None. If no samples have been
+        collected for the given category, then that category name will
+        not be present in the return value. """
         # note that Amazon's Dynamo paper says they use 99.9% percentile.
         output = {}
         for category in self.latencies:
         # note that Amazon's Dynamo paper says they use 99.9% percentile.
         output = {}
         for category in self.latencies:
@@ -129,16 +132,25 @@ class StorageServer(service.MultiService, Referenceable):
                 continue
             stats = {}
             samples = self.latencies[category][:]
                 continue
             stats = {}
             samples = self.latencies[category][:]
-            samples.sort()
             count = len(samples)
             count = len(samples)
-            stats["mean"] = sum(samples) / count
-            stats["01_0_percentile"] = samples[int(0.01 * count)]
-            stats["10_0_percentile"] = samples[int(0.1 * count)]
-            stats["50_0_percentile"] = samples[int(0.5 * count)]
-            stats["90_0_percentile"] = samples[int(0.9 * count)]
-            stats["95_0_percentile"] = samples[int(0.95 * count)]
-            stats["99_0_percentile"] = samples[int(0.99 * count)]
-            stats["99_9_percentile"] = samples[int(0.999 * count)]
+            stats["samplesize"] = count
+            samples.sort()
+            if count > 1:
+                stats["mean"] = sum(samples) / count
+            else:
+                stats["mean"] = None
+
+            orderstatlist = [(0.01, "01_0_percentile", 100), (0.1, "10_0_percentile", 10),\
+                             (0.50, "50_0_percentile", 10), (0.90, "90_0_percentile", 10),\
+                             (0.95, "95_0_percentile", 20), (0.99, "99_0_percentile", 100),\
+                             (0.999, "99_9_percentile", 1000)]
+
+            for percentile, percentilestring, minnumtoobserve in orderstatlist:
+                if count >= minnumtoobserve:
+                    stats[percentilestring] = samples[int(percentile*count)]
+                else:
+                    stats[percentilestring] = None
+
             output[category] = stats
         return output
 
             output[category] = stats
         return output
 
@@ -551,4 +563,3 @@ class StorageServer(service.MultiService, Referenceable):
                 share_type=share_type, si=si_s, shnum=shnum, reason=reason,
                 level=log.SCARY, umid="SGx2fA")
         return None
                 share_type=share_type, si=si_s, shnum=shnum, reason=reason,
                 level=log.SCARY, umid="SGx2fA")
         return None
-
index d63915842a769a9112b75ee70d241a1fa6e1b8e5..afe5824f44eb772edfac3d36a13833ee3d0b474f 100644 (file)
@@ -1311,6 +1311,8 @@ class Stats(unittest.TestCase):
             ss.add_latency("allocate", 1.0 * i)
         for i in range(1000):
             ss.add_latency("renew", 1.0 * i)
             ss.add_latency("allocate", 1.0 * i)
         for i in range(1000):
             ss.add_latency("renew", 1.0 * i)
+        for i in range(20):
+            ss.add_latency("write", 1.0 * i)
         for i in range(10):
             ss.add_latency("cancel", 2.0 * i)
         ss.add_latency("get", 5.0)
         for i in range(10):
             ss.add_latency("cancel", 2.0 * i)
         ss.add_latency("get", 5.0)
@@ -1318,7 +1320,7 @@ class Stats(unittest.TestCase):
         output = ss.get_latencies()
 
         self.failUnlessEqual(sorted(output.keys()),
         output = ss.get_latencies()
 
         self.failUnlessEqual(sorted(output.keys()),
-                             sorted(["allocate", "renew", "cancel", "get"]))
+                             sorted(["allocate", "renew", "cancel", "write", "get"]))
         self.failUnlessEqual(len(ss.latencies["allocate"]), 1000)
         self.failUnless(abs(output["allocate"]["mean"] - 9500) < 1, output)
         self.failUnless(abs(output["allocate"]["01_0_percentile"] - 9010) < 1, output)
         self.failUnlessEqual(len(ss.latencies["allocate"]), 1000)
         self.failUnless(abs(output["allocate"]["mean"] - 9500) < 1, output)
         self.failUnless(abs(output["allocate"]["01_0_percentile"] - 9010) < 1, output)
@@ -1339,25 +1341,35 @@ class Stats(unittest.TestCase):
         self.failUnless(abs(output["renew"]["99_0_percentile"] - 990) < 1, output)
         self.failUnless(abs(output["renew"]["99_9_percentile"] - 999) < 1, output)
 
         self.failUnless(abs(output["renew"]["99_0_percentile"] - 990) < 1, output)
         self.failUnless(abs(output["renew"]["99_9_percentile"] - 999) < 1, output)
 
+        self.failUnlessEqual(len(ss.latencies["write"]), 20)
+        self.failUnless(abs(output["write"]["mean"] - 9) < 1, output)
+        self.failUnless(output["write"]["01_0_percentile"] is None, output)
+        self.failUnless(abs(output["write"]["10_0_percentile"] -  2) < 1, output)
+        self.failUnless(abs(output["write"]["50_0_percentile"] - 10) < 1, output)
+        self.failUnless(abs(output["write"]["90_0_percentile"] - 18) < 1, output)
+        self.failUnless(abs(output["write"]["95_0_percentile"] - 19) < 1, output)
+        self.failUnless(output["write"]["99_0_percentile"] is None, output)
+        self.failUnless(output["write"]["99_9_percentile"] is None, output)
+
         self.failUnlessEqual(len(ss.latencies["cancel"]), 10)
         self.failUnless(abs(output["cancel"]["mean"] - 9) < 1, output)
         self.failUnlessEqual(len(ss.latencies["cancel"]), 10)
         self.failUnless(abs(output["cancel"]["mean"] - 9) < 1, output)
-        self.failUnless(abs(output["cancel"]["01_0_percentile"] -  0) < 1, output)
+        self.failUnless(output["cancel"]["01_0_percentile"] is None, output)
         self.failUnless(abs(output["cancel"]["10_0_percentile"] -  2) < 1, output)
         self.failUnless(abs(output["cancel"]["50_0_percentile"] - 10) < 1, output)
         self.failUnless(abs(output["cancel"]["90_0_percentile"] - 18) < 1, output)
         self.failUnless(abs(output["cancel"]["10_0_percentile"] -  2) < 1, output)
         self.failUnless(abs(output["cancel"]["50_0_percentile"] - 10) < 1, output)
         self.failUnless(abs(output["cancel"]["90_0_percentile"] - 18) < 1, output)
-        self.failUnless(abs(output["cancel"]["95_0_percentile"] - 18) < 1, output)
-        self.failUnless(abs(output["cancel"]["99_0_percentile"] - 18) < 1, output)
-        self.failUnless(abs(output["cancel"]["99_9_percentile"] - 18) < 1, output)
+        self.failUnless(output["cancel"]["95_0_percentile"] is None, output)
+        self.failUnless(output["cancel"]["99_0_percentile"] is None, output)
+        self.failUnless(output["cancel"]["99_9_percentile"] is None, output)
 
         self.failUnlessEqual(len(ss.latencies["get"]), 1)
 
         self.failUnlessEqual(len(ss.latencies["get"]), 1)
-        self.failUnless(abs(output["get"]["mean"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["01_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["10_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["50_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["90_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["95_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["99_0_percentile"] - 5) < 1, output)
-        self.failUnless(abs(output["get"]["99_9_percentile"] - 5) < 1, output)
+        self.failUnless(output["get"]["mean"] is None, output)
+        self.failUnless(output["get"]["01_0_percentile"] is None, output)
+        self.failUnless(output["get"]["10_0_percentile"] is None, output)
+        self.failUnless(output["get"]["50_0_percentile"] is None, output)
+        self.failUnless(output["get"]["90_0_percentile"] is None, output)
+        self.failUnless(output["get"]["95_0_percentile"] is None, output)
+        self.failUnless(output["get"]["99_0_percentile"] is None, output)
+        self.failUnless(output["get"]["99_9_percentile"] is None, output)
 
 def remove_tags(s):
     s = re.sub(r'<[^>]*>', ' ', s)
 
 def remove_tags(s):
     s = re.sub(r'<[^>]*>', ' ', s)