util/statistics: add tests, fix mean_repair_cost

author Brian Warner <warner@lothar.com>

Sun, 15 Feb 2009 23:23:26 +0000 (16:23 -0700)

committer Brian Warner <warner@lothar.com>

Sun, 15 Feb 2009 23:23:26 +0000 (16:23 -0700)
author Brian Warner <warner@lothar.com>
Sun, 15 Feb 2009 23:23:26 +0000 (16:23 -0700)
committer Brian Warner <warner@lothar.com>
Sun, 15 Feb 2009 23:23:26 +0000 (16:23 -0700)
diff --git a/src/allmydata/test/test_util.py b/src/allmydata/test/test_util.py

index 973c798a41e663df65566ded0a24986c76af7e9b..e1e6ab4184d3c151e6eaa39224160e7ad1a37811 100644 (file)
--- a/src/allmydata/test/test_util.py
+++ b/src/allmydata/test/test_util.py
@@ -1,7 +1,8 @@
  
  def foo(): pass # keep the line number constant
  
-import os, time, random
+import os, time
+from StringIO import StringIO
  from twisted.trial import unittest
  from twisted.internet import defer, reactor
  from twisted.python import failure
@@ -206,6 +207,13 @@ class Statistics(unittest.TestCase):
          self.should_assert("Should assert if not 0<=p<=1", f, 1, -1)
          self.should_assert("Should assert if n < 1", f, 0, .1)
  
+        out = StringIO()
+        statistics.print_pmf(pmf_comp, out=out)
+        lines = out.getvalue().splitlines()
+        self.failUnlessEqual(lines[0], "i=0: 0.81")
+        self.failUnlessEqual(lines[1], "i=1: 0.18")
+        self.failUnlessEqual(lines[2], "i=2: 0.01")
+
      def test_survival_pmf(self):
          f = statistics.survival_pmf
          # Cross-check binomial-distribution method against convolution
@@ -217,7 +225,39 @@ class Statistics(unittest.TestCase):
          self.failUnlessTrue(statistics.valid_pmf(pmf1))
          self.should_assert("Should assert if p_i > 1", f, [1.1]);
          self.should_assert("Should assert if p_i < 0", f, [-.1]);
-        
+
+    def test_repair_count_pmf(self):
+        survival_pmf = statistics.binomial_distribution_pmf(5, .9)
+        repair_pmf = statistics.repair_count_pmf(survival_pmf, 3)
+        # repair_pmf[0] == sum(survival_pmf[0,1,2,5])
+        # repair_pmf[1] == survival_pmf[4]
+        # repair_pmf[2] = survival_pmf[3]
+        self.failUnlessListAlmostEqual(repair_pmf,
+                                       [0.00001 + 0.00045 + 0.0081 + 0.59049,
+                                        .32805,
+                                        .0729,
+                                        0, 0, 0])
+
+    def test_repair_cost(self):
+        survival_pmf = statistics.binomial_distribution_pmf(5, .9)
+        bwcost = statistics.bandwidth_cost_function
+        cost = statistics.mean_repair_cost(bwcost, 1000,
+                                           survival_pmf, 3, ul_dl_ratio=1.0)
+        self.failUnlessAlmostEqual(cost, 558.90)
+        cost = statistics.mean_repair_cost(bwcost, 1000,
+                                           survival_pmf, 3, ul_dl_ratio=8.0)
+        self.failUnlessAlmostEqual(cost, 1664.55)
+
+        # I haven't manually checked the math beyond here -warner
+        cost = statistics.eternal_repair_cost(bwcost, 1000,
+                                              survival_pmf, 3,
+                                              discount_rate=0, ul_dl_ratio=1.0)
+        self.failUnlessAlmostEqual(cost, 65292.056074766246)
+        cost = statistics.eternal_repair_cost(bwcost, 1000,
+                                              survival_pmf, 3,
+                                              discount_rate=0.05,
+                                              ul_dl_ratio=1.0)
+        self.failUnlessAlmostEqual(cost, 9133.6097158191551)
  
      def test_convolve(self):
          f = statistics.convolve
@@ -250,7 +290,7 @@ class Statistics(unittest.TestCase):
      def test_find_k(self):
          f = statistics.find_k
          g = statistics.pr_file_loss
-        plist = [.9] * 10 + [.8] * 10
+        plist = [.9] * 10 + [.8] * 10 # N=20
          t = .0001
          k = f(plist, t)
          self.failUnlessEqual(k, 10)
diff --git a/src/allmydata/util/statistics.py b/src/allmydata/util/statistics.py

index 0f9bc3d46d71b293cafb26a2d573639b6e1ab10a..4df023822dbf1f9e360fa36419ae6076a68d0a10 100644 (file)
--- a/src/allmydata/util/statistics.py
+++ b/src/allmydata/util/statistics.py
@@ -4,7 +4,7 @@
  from __future__ import division
  from mathutil import round_sigfigs
  import math
-import array
+import sys
  
  def pr_file_loss(p_list, k):
      """
@@ -87,13 +87,13 @@ def survival_pmf_via_conv(p_list):
      pmf_list = [ [1 - p, p] for p in p_list ];
      return reduce(convolve, pmf_list)
  
-def print_pmf(pmf, n=4):
+def print_pmf(pmf, n=4, out=sys.stdout):
      """
      Print a PMF in a readable form, with values rounded to n
      significant digits. 
      """
      for k, p in enumerate(pmf):
-        print "i=" + str(k) + ":", round_sigfigs(p, n)
+        print >>out, "i=" + str(k) + ":", round_sigfigs(p, n)
  
  def pr_backup_file_loss(p_list, backup_p, k):
      """
@@ -136,6 +136,7 @@ def find_k_from_pmf(pmf, target_loss_prob):
          if loss_prob > target_loss_prob:
              return k
  
+    # we shouldn't be able to get here, since sum(pmf)==1.0
      k = len(pmf) - 1
      return k
  
@@ -166,23 +167,27 @@ def repair_count_pmf(survival_pmf, k):
  def bandwidth_cost_function(file_size, shares, k, ul_dl_ratio):
      return file_size + float(file_size) / k * shares * ul_dl_ratio
  
-def mean_repair_cost(cost_function, file_size, survival_pmf, k):
+def mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio):
      """
      Return the expected cost for a repair run on a file with the given
-    survival_pmf and requiring k shares.
+    survival_pmf and requiring k shares, in which upload cost is
+    'ul_dl_ratio' times download cost.
      """
      repair_pmf = repair_count_pmf(survival_pmf, k)
-    exp_cnt = sum([d * repair_pmf[d] for d in range(1, len(repair_pmf))])
-    return cost_function(file_size, exp_cnt, k)
+    expected_cost = sum([cost_function(file_size, new_shares, k, ul_dl_ratio)
+                         * repair_pmf[new_shares]
+                         for new_shares in range(1, len(repair_pmf))])
+    return expected_cost
  
-def eternal_repair_cost(cost_function, file_size, survival_pmf, k, discount_rate=0):
+def eternal_repair_cost(cost_function, file_size, survival_pmf, k,
+                        discount_rate=0, ul_dl_ratio=1.0):
      """
      Calculate the eternal repair cost for a file that is aggressively
-    repaired.
+    repaired, i.e. the sum of repair costs until the file is dead.
      """
-    c = mean_repair_cost(cost_function, file_size, survival_pmf, k)
+    c = mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio)
      f = 1 - sum(survival_pmf[0:k])
-    r = discount_rate
+    r = float(discount_rate)
  
      return (c * (1-r)) / (1 - (1-r) * f)
  
@@ -259,9 +264,6 @@ def binomial_coeff(n, k):
      """
      assert n >= k
  
-    if k > n:
-        return 0
-
      if k > n/2:
          k = n - k
author	Brian Warner <warner@lothar.com>
	Sun, 15 Feb 2009 23:23:26 +0000 (16:23 -0700)
committer	Brian Warner <warner@lothar.com>
	Sun, 15 Feb 2009 23:23:26 +0000 (16:23 -0700)
src/allmydata/test/test_util.py		patch \| blob \| history
src/allmydata/util/statistics.py		patch \| blob \| history