From: Brian Warner Date: Sun, 15 Feb 2009 23:23:26 +0000 (-0700) Subject: util/statistics: add tests, fix mean_repair_cost X-Git-Tag: allmydata-tahoe-1.4.0~222 X-Git-Url: https://git.rkrishnan.org/pf/content/en/seg?a=commitdiff_plain;h=e1380b132bd6628d85471978c65b3de5e4b921eb;p=tahoe-lafs%2Ftahoe-lafs.git util/statistics: add tests, fix mean_repair_cost --- diff --git a/src/allmydata/test/test_util.py b/src/allmydata/test/test_util.py index 973c798a..e1e6ab41 100644 --- a/src/allmydata/test/test_util.py +++ b/src/allmydata/test/test_util.py @@ -1,7 +1,8 @@ def foo(): pass # keep the line number constant -import os, time, random +import os, time +from StringIO import StringIO from twisted.trial import unittest from twisted.internet import defer, reactor from twisted.python import failure @@ -206,6 +207,13 @@ class Statistics(unittest.TestCase): self.should_assert("Should assert if not 0<=p<=1", f, 1, -1) self.should_assert("Should assert if n < 1", f, 0, .1) + out = StringIO() + statistics.print_pmf(pmf_comp, out=out) + lines = out.getvalue().splitlines() + self.failUnlessEqual(lines[0], "i=0: 0.81") + self.failUnlessEqual(lines[1], "i=1: 0.18") + self.failUnlessEqual(lines[2], "i=2: 0.01") + def test_survival_pmf(self): f = statistics.survival_pmf # Cross-check binomial-distribution method against convolution @@ -217,7 +225,39 @@ class Statistics(unittest.TestCase): self.failUnlessTrue(statistics.valid_pmf(pmf1)) self.should_assert("Should assert if p_i > 1", f, [1.1]); self.should_assert("Should assert if p_i < 0", f, [-.1]); - + + def test_repair_count_pmf(self): + survival_pmf = statistics.binomial_distribution_pmf(5, .9) + repair_pmf = statistics.repair_count_pmf(survival_pmf, 3) + # repair_pmf[0] == sum(survival_pmf[0,1,2,5]) + # repair_pmf[1] == survival_pmf[4] + # repair_pmf[2] = survival_pmf[3] + self.failUnlessListAlmostEqual(repair_pmf, + [0.00001 + 0.00045 + 0.0081 + 0.59049, + .32805, + .0729, + 0, 0, 0]) + + def test_repair_cost(self): + survival_pmf = statistics.binomial_distribution_pmf(5, .9) + bwcost = statistics.bandwidth_cost_function + cost = statistics.mean_repair_cost(bwcost, 1000, + survival_pmf, 3, ul_dl_ratio=1.0) + self.failUnlessAlmostEqual(cost, 558.90) + cost = statistics.mean_repair_cost(bwcost, 1000, + survival_pmf, 3, ul_dl_ratio=8.0) + self.failUnlessAlmostEqual(cost, 1664.55) + + # I haven't manually checked the math beyond here -warner + cost = statistics.eternal_repair_cost(bwcost, 1000, + survival_pmf, 3, + discount_rate=0, ul_dl_ratio=1.0) + self.failUnlessAlmostEqual(cost, 65292.056074766246) + cost = statistics.eternal_repair_cost(bwcost, 1000, + survival_pmf, 3, + discount_rate=0.05, + ul_dl_ratio=1.0) + self.failUnlessAlmostEqual(cost, 9133.6097158191551) def test_convolve(self): f = statistics.convolve @@ -250,7 +290,7 @@ class Statistics(unittest.TestCase): def test_find_k(self): f = statistics.find_k g = statistics.pr_file_loss - plist = [.9] * 10 + [.8] * 10 + plist = [.9] * 10 + [.8] * 10 # N=20 t = .0001 k = f(plist, t) self.failUnlessEqual(k, 10) diff --git a/src/allmydata/util/statistics.py b/src/allmydata/util/statistics.py index 0f9bc3d4..4df02382 100644 --- a/src/allmydata/util/statistics.py +++ b/src/allmydata/util/statistics.py @@ -4,7 +4,7 @@ from __future__ import division from mathutil import round_sigfigs import math -import array +import sys def pr_file_loss(p_list, k): """ @@ -87,13 +87,13 @@ def survival_pmf_via_conv(p_list): pmf_list = [ [1 - p, p] for p in p_list ]; return reduce(convolve, pmf_list) -def print_pmf(pmf, n=4): +def print_pmf(pmf, n=4, out=sys.stdout): """ Print a PMF in a readable form, with values rounded to n significant digits. """ for k, p in enumerate(pmf): - print "i=" + str(k) + ":", round_sigfigs(p, n) + print >>out, "i=" + str(k) + ":", round_sigfigs(p, n) def pr_backup_file_loss(p_list, backup_p, k): """ @@ -136,6 +136,7 @@ def find_k_from_pmf(pmf, target_loss_prob): if loss_prob > target_loss_prob: return k + # we shouldn't be able to get here, since sum(pmf)==1.0 k = len(pmf) - 1 return k @@ -166,23 +167,27 @@ def repair_count_pmf(survival_pmf, k): def bandwidth_cost_function(file_size, shares, k, ul_dl_ratio): return file_size + float(file_size) / k * shares * ul_dl_ratio -def mean_repair_cost(cost_function, file_size, survival_pmf, k): +def mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio): """ Return the expected cost for a repair run on a file with the given - survival_pmf and requiring k shares. + survival_pmf and requiring k shares, in which upload cost is + 'ul_dl_ratio' times download cost. """ repair_pmf = repair_count_pmf(survival_pmf, k) - exp_cnt = sum([d * repair_pmf[d] for d in range(1, len(repair_pmf))]) - return cost_function(file_size, exp_cnt, k) + expected_cost = sum([cost_function(file_size, new_shares, k, ul_dl_ratio) + * repair_pmf[new_shares] + for new_shares in range(1, len(repair_pmf))]) + return expected_cost -def eternal_repair_cost(cost_function, file_size, survival_pmf, k, discount_rate=0): +def eternal_repair_cost(cost_function, file_size, survival_pmf, k, + discount_rate=0, ul_dl_ratio=1.0): """ Calculate the eternal repair cost for a file that is aggressively - repaired. + repaired, i.e. the sum of repair costs until the file is dead. """ - c = mean_repair_cost(cost_function, file_size, survival_pmf, k) + c = mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio) f = 1 - sum(survival_pmf[0:k]) - r = discount_rate + r = float(discount_rate) return (c * (1-r)) / (1 - (1-r) * f) @@ -259,9 +264,6 @@ def binomial_coeff(n, k): """ assert n >= k - if k > n: - return 0 - if k > n/2: k = n - k