From: Shawn Willden Date: Wed, 14 Jan 2009 03:12:35 +0000 (-0700) Subject: Statistics module X-Git-Tag: allmydata-tahoe-1.4.0~228 X-Git-Url: https://git.rkrishnan.org/components/com_hotproperty/simplejson/running.html?a=commitdiff_plain;h=21832280da410f8e7b2d5a055b32ca1e4e996c67;p=tahoe-lafs%2Ftahoe-lafs.git Statistics module Added a statistics module for calculating various facets of share survival statistics. --- diff --git a/docs/lossmodel.lyx b/docs/lossmodel.lyx new file mode 100644 index 00000000..d2d852bf --- /dev/null +++ b/docs/lossmodel.lyx @@ -0,0 +1,688 @@ +#LyX 1.6.1 created this file. For more info see http://www.lyx.org/ +\lyxformat 345 +\begin_document +\begin_header +\textclass amsart +\use_default_options true +\begin_modules +theorems-ams +theorems-ams-extended +\end_modules +\language english +\inputencoding auto +\font_roman default +\font_sans default +\font_typewriter default +\font_default_family default +\font_sc false +\font_osf false +\font_sf_scale 100 +\font_tt_scale 100 + +\graphics default +\paperfontsize default +\spacing single +\use_hyperref false +\papersize default +\use_geometry false +\use_amsmath 1 +\use_esint 1 +\cite_engine basic +\use_bibtopic false +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation indent +\defskip medskip +\quotes_language english +\papercolumns 1 +\papersides 1 +\paperpagestyle default +\tracking_changes false +\output_changes false +\author "" +\author "" +\end_header + +\begin_body + +\begin_layout Title +Tahoe Distributed Filesharing System Loss Model +\end_layout + +\begin_layout Author +Shawn Willden +\end_layout + +\begin_layout Email +shawn@willden.org +\end_layout + +\begin_layout Abstract +The abstract goes here +\end_layout + +\begin_layout Section +Problem Statement +\end_layout + +\begin_layout Standard +The allmydata Tahoe distributed file system uses Reed-Solomon erasure coding + to split files into +\begin_inset Formula $N$ +\end_inset + + shares, each of which is then delivered to a randomly-selected peer in + a distributed network. + The file can later be reassembled from any +\begin_inset Formula $k\leq N$ +\end_inset + + of the shares, if they are available. +\end_layout + +\begin_layout Standard +Over time shares are lost for a variety of reasons. + Storage servers may crash, be destroyed or simply be removed from the network. + To mitigate such losses, Tahoe network clients employ a repair agent which + scans the peers once per time period +\begin_inset Formula $A$ +\end_inset + + and determines how many of the shares remain. + If less than +\begin_inset Formula $R$ +\end_inset + + ( +\begin_inset Formula $k\leq R\leq N$ +\end_inset + +) shares remain, then the repairer reconstructs the file shares and redistribute +s the missing ones, bringing the availability back up to full. +\end_layout + +\begin_layout Standard +The question we're trying to answer is "What's the probability that we'll + be able to reassemble the file at some later time +\begin_inset Formula $T$ +\end_inset + +?". + We'd also like to be able to determine what values we should choose for + +\begin_inset Formula $k$ +\end_inset + +, +\begin_inset Formula $N$ +\end_inset + +, +\begin_inset Formula $A$ +\end_inset + +, and +\begin_inset Formula $R$ +\end_inset + + in order to ensure +\begin_inset Formula $Pr[loss]\leq t$ +\end_inset + + for some threshold probability +\begin_inset Formula $t$ +\end_inset + +. + This is an optimization problem because although we could obtain very low + +\begin_inset Formula $Pr[loss]$ +\end_inset + + by choosing small +\begin_inset Formula $k,$ +\end_inset + + large +\begin_inset Formula $N$ +\end_inset + +, small +\begin_inset Formula $A$ +\end_inset + +, and setting +\begin_inset Formula $R=N$ +\end_inset + +, these choices have costs. + The peer storage and bandwidth consumed by the share distribution process + are approximately +\begin_inset Formula $\nicefrac{N}{k}$ +\end_inset + + times the size of the original file, so we would like to reduce this ratio + as far as possible consistent with +\begin_inset Formula $Pr[loss]\leq t$ +\end_inset + +. + Likewise, frequent and aggressive repair process can be used to ensure + that the number of shares available at any time is very close to +\begin_inset Formula $N,$ +\end_inset + + but at a cost in bandwidth. +\end_layout + +\begin_layout Section +Reliability +\end_layout + +\begin_layout Standard +The probability that the file becomes unrecoverable is dependent upon the + probability that the peers to whom we send shares are able to return those + copies on demand. + Shares that are returned in corrupted form can be detected and discarded, + so there is no need to distinguish between corruption and loss. +\end_layout + +\begin_layout Standard +There are a large number of factors that affect share availability. + Availability can be temporarily interrupted by peer unavailability, due + to network outages, power failures or administrative shutdown, among other + reasons. + Availability can be permanently lost due to failure or corruption of storage + media, catastrophic damage to the peer system, administrative error, withdrawal + from the network, malicious corruption, etc. +\end_layout + +\begin_layout Standard +The existence of intermittent failure modes motivates the introduction of + a distinction between +\noun on +availability +\noun default + and +\noun on +reliability +\noun default +. + Reliability is the probability that a share is retrievable assuming intermitten +t failures can be waited out, so reliability considers only permanent failures. + Availability considers all failures, and is focused on the probability + of retrieval within some defined time frame. +\end_layout + +\begin_layout Standard +Another consideration is that some failures affect multiple shares. + If multiple shares of a file are stored on a single hard drive, for example, + failure of that drive may lose them all. + Catastrophic damage to a data center may destroy all shares on all peers + in that data center. +\end_layout + +\begin_layout Standard +While the types of failures that may occur are pretty consistent across + even very different peers, their probabilities differ dramatically. + A professionally-administered blade server with redundant storage, power + and Internet located in a carefully-monitored data center with automatic + fire suppression systems is much less likely to become either temporarily + or permanently unavailable than the typical virus and malware-ridden home + computer on a single cable modem connection. + A variety of situations in between exist as well, such as the case of the + author's home file server, which is administered by an IT professional + and uses RAID level 6 redundant storage, but runs on old, cobbled-together + equipment, and has a consumer-grade Internet connection. +\end_layout + +\begin_layout Standard +To begin with, let's use a simple definition of reliability: +\end_layout + +\begin_layout Definition + +\noun on +Reliability +\noun default + is the probability +\begin_inset Formula $p_{i}$ +\end_inset + + that a share +\begin_inset Formula $s_{i}$ +\end_inset + + will surve to (be retrievable at) time +\begin_inset Formula $T=A$ +\end_inset + +, ignoring intermittent failures. + That is, the probability that the share will be retrievable at the end + of the current repair cycle, and therefore usable by the repairer to regenerate + any lost shares. +\end_layout + +\begin_layout Definition +Reliability is clearly dependent on +\begin_inset Formula $A$ +\end_inset + +. + Short repair cycles offer less time for shares to +\begin_inset Quotes eld +\end_inset + +decay +\begin_inset Quotes erd +\end_inset + + into unavailability. +\end_layout + +\begin_layout Subsection +Fixed Reliability +\end_layout + +\begin_layout Standard +In the simplest case, the peers holding the file shares all have the same + reliability +\begin_inset Formula $p$ +\end_inset + +, and are all independent from one another. + Let +\begin_inset Formula $K$ +\end_inset + + be a random variable that represents the number of shares that survive + +\begin_inset Formula $A$ +\end_inset + +. + Each share's survival can be viewed as an indepedent Bernoulli trial with + a succes probability of +\begin_inset Formula $p$ +\end_inset + +, which means that +\begin_inset Formula $K$ +\end_inset + + follows the binomial distribution with paramaters +\begin_inset Formula $N$ +\end_inset + + and +\begin_inset Formula $p$ +\end_inset + + ( +\begin_inset Formula $K\sim B(N,p)$ +\end_inset + +). + The probability mass function (PMF) of the binomial distribution is: +\begin_inset Formula \begin{equation} +Pr(K=i)=f(i;N,p)=\binom{n}{i}p^{i}(1-p)^{n-i}\label{eq:binomial-pdf}\end{equation} + +\end_inset + + +\end_layout + +\begin_layout Standard +A file survives if at least +\begin_inset Formula $k$ +\end_inset + + of the +\begin_inset Formula $N$ +\end_inset + + shares survive. + Equation +\begin_inset CommandInset ref +LatexCommand ref +reference "eq:binomial-pdf" + +\end_inset + + gives the probability that exactly +\begin_inset Formula $i$ +\end_inset + + shares survive, so the probability that fewer than +\begin_inset Formula $k$ +\end_inset + + survive is the sum of the probabilities that +\begin_inset Formula $0,1,2,\ldots,k-1$ +\end_inset + + shares survive. + That is: +\end_layout + +\begin_layout Standard +\begin_inset Formula \begin{equation} +Pr[failure]=\sum_{i=0}^{k-1}\binom{n}{i}p^{i}(1-p)^{n-i}\label{eq:simple-failure}\end{equation} + +\end_inset + + +\end_layout + +\begin_layout Subsection +Independent Reliability +\end_layout + +\begin_layout Standard +Equation +\begin_inset CommandInset ref +LatexCommand ref +reference "eq:simple-failure" + +\end_inset + + assumes that each share has the same probability of survival, but as explained + above, this is not typically true. + A more accurate model allows each share +\begin_inset Formula $s_{i}$ +\end_inset + + an independent probability of survival +\begin_inset Formula $p_{i}$ +\end_inset + +. + Each share's survival can still be treated as an independent Bernoulli + trial, but with success probability +\begin_inset Formula $p_{i}$ +\end_inset + +. + Under this assumption, +\begin_inset Formula $K$ +\end_inset + + follows a generalized distribution with parameters +\begin_inset Formula $N$ +\end_inset + + and +\begin_inset Formula $p_{i},1\leq i\leq N$ +\end_inset + +. +\end_layout + +\begin_layout Standard +The PMF for this generalized +\begin_inset Formula $K$ +\end_inset + + does not have a simple closed-form representation. + However, the PMFs for random variables representing individual share survival + do. + Let +\begin_inset Formula $S_{i}$ +\end_inset + + be a random variable such that: +\end_layout + +\begin_layout Standard +\begin_inset Formula \[ +S_{i}=\begin{cases} +1 & \textnormal{if }s_{i}\textnormal{ survives}\\ +0 & \textnormal{if }s_{i}\textnormal{ fails}\end{cases}\] + +\end_inset + + +\end_layout + +\begin_layout Standard +The PMF for +\begin_inset Formula $Si$ +\end_inset + + is very simple, +\begin_inset Formula $Pr(S_{i}=1)=p_{i}$ +\end_inset + + and +\begin_inset Formula $Pr(S_{i}=0)=p_{i}$ +\end_inset + +. +\end_layout + +\begin_layout Standard +Observe that +\begin_inset Formula $\sum_{i=1}^{N}S_{i}=K$ +\end_inset + +. + Effectively, +\begin_inset Formula $K$ +\end_inset + + has just been separated into the series of Bernoulli trials that make it + up. +\end_layout + +\begin_layout Standard +The discrete convolution theorem states that given random variables +\begin_inset Formula $X$ +\end_inset + + and +\begin_inset Formula $Y$ +\end_inset + + and their sum +\begin_inset Formula $Z=X+Y$ +\end_inset + +, if +\begin_inset Formula $Pr[X=x]=f(x)$ +\end_inset + + and +\begin_inset Formula $Pr[Y=y]=f(y)$ +\end_inset + + then +\begin_inset Formula $Pr[Z=z]=(f\star g)(z)$ +\end_inset + + where +\begin_inset Formula $\star$ +\end_inset + + denotes the convolution operation. + Stated in English, the probability mass function of the sum of two random + variables is the convolution of the probability mass functions of the two + random variables. +\end_layout + +\begin_layout Standard +Discrete convolution is defined as +\end_layout + +\begin_layout Standard +\begin_inset Formula \[ +(f\star g)(n)=\sum_{m=-\infty}^{\infty}f(m)\cdot g(n-m)\] + +\end_inset + + +\end_layout + +\begin_layout Standard +The infinite summation is no problem because the probability mass functions + we need to convolve are zero outside of a small range. +\end_layout + +\begin_layout Standard +According to the discrete convolution theorem, then, if +\begin_inset Formula $Pr[K=i]=f(i)$ +\end_inset + + and +\begin_inset Formula $Pr[S_{i}=j]=g_{i}(j)$ +\end_inset + +, then +\begin_inset Formula $ $ +\end_inset + + +\begin_inset Formula $f=g_{1}\star g_{2}\star g_{3}\star\ldots\star g_{N}$ +\end_inset + +. + Since convolution is associative, this can also be written as +\begin_inset Formula $ $ +\end_inset + + +\begin_inset Formula \begin{equation} +f=(g_{1}\star g_{2})\star g_{3})\star\ldots)\star g_{N})\label{eq:convolution}\end{equation} + +\end_inset + +which enables +\begin_inset Formula $f$ +\end_inset + + to be implemented as a sequence of convolution operations on the simple + PMFs of the random variables +\begin_inset Formula $S_{i}$ +\end_inset + +. + In fact, as values of +\begin_inset Formula $N$ +\end_inset + + get large, equation +\begin_inset CommandInset ref +LatexCommand ref +reference "eq:convolution" + +\end_inset + + turns out to be a more effective means of computing the PMF of +\begin_inset Formula $K$ +\end_inset + + even in the case of the standard bernoulli distribution, primarily because + the binomial calculation in equation +\begin_inset CommandInset ref +LatexCommand ref +reference "eq:binomial-pdf" + +\end_inset + + produces very large values that overflow unless arbitrary precision numeric + representations are used, or unless the binomial calculation is very cleverly + interleaved with the powers of +\begin_inset Formula $p$ +\end_inset + + and +\begin_inset Formula $1-p$ +\end_inset + + to keep the values manageable. +\end_layout + +\begin_layout Standard +Note also that it is not necessary to have very simple PMFs like those of + the +\begin_inset Formula $S_{i}$ +\end_inset + +. + Any share or set of shares that has a known PMF can be combined with any + other set with a known PMF by convolution, as long as the two share sets + are independent. + Since PMFs are easily represented as simple lists of probabilities, where + the +\begin_inset Formula $i$ +\end_inset + +th element in the list corresponds to +\begin_inset Formula $Pr[K=i]$ +\end_inset + +, these functions are easily managed in software, and computing the convolution + is both simple and efficient. +\end_layout + +\begin_layout Subsection +Multiple Failure Modes +\end_layout + +\begin_layout Standard +In modeling share survival probabilities, it's useful to be able to analyze + separately each of the various failure modes. + If reliable statistics for disk failure can be obtained, then a probability + mass function for that form of failure can be generated. + Similarly, statistics on other hardware failures, administrative errors, + network losses, etc., can all be estimated independently. + If those estimates can then be combined into a single PMF for that server, + then we can use it to predict failures for that server. +\end_layout + +\begin_layout Standard +In the case of independent failure modes for a single server, this is very + simple to do. + If +\begin_inset Formula $p_{i,j}$ +\end_inset + + is the probability of survival of the +\begin_inset Formula $j$ +\end_inset + +th failure mode of server +\begin_inset Formula $i$ +\end_inset + +, and there are +\begin_inset Formula $m$ +\end_inset + + failure modes then +\begin_inset Formula \[ +p_{i}=\prod_{j=1}^{m}p_{i,j}\] + +\end_inset + + is the probability of server +\begin_inset Formula $i$ +\end_inset + +'s survival and +\begin_inset Formula \[ +Pr[S_{i}=k]=f(k)=\begin{cases} +1-p_{i} & k=0\\ +p_{i} & k=1\end{cases}\] + +\end_inset + + is the full survival PMF. +\end_layout + +\begin_layout Standard + +\end_layout + +\end_body +\end_document diff --git a/src/allmydata/test/test_util.py b/src/allmydata/test/test_util.py index 32abb7ef..973c798a 100644 --- a/src/allmydata/test/test_util.py +++ b/src/allmydata/test/test_util.py @@ -1,7 +1,7 @@ def foo(): pass # keep the line number constant -import os, time +import os, time, random from twisted.trial import unittest from twisted.internet import defer, reactor from twisted.python import failure @@ -9,6 +9,7 @@ from twisted.python import failure from allmydata.util import base32, idlib, humanreadable, mathutil, hashutil from allmydata.util import assertutil, fileutil, deferredutil, abbreviate from allmydata.util import limiter, time_format, pollmixin, cachedir +from allmydata.util import statistics class Base32(unittest.TestCase): def test_b2a_matches_Pythons(self): @@ -163,6 +164,108 @@ class Math(unittest.TestCase): self.failUnlessEqual(f([0,0,0,4]), 1) self.failUnlessAlmostEqual(f([0.0, 1.0, 1.0]), .666666666666) + def test_round_sigfigs(self): + f = mathutil.round_sigfigs + self.failUnlessEqual(f(22.0/3, 4), 7.3330000000000002) + +class Statistics(unittest.TestCase): + def should_assert(self, msg, func, *args, **kwargs): + try: + func(*args, **kwargs) + self.fail(msg) + except AssertionError, e: + pass + + def failUnlessListEqual(self, a, b, msg = None): + self.failUnlessEqual(len(a), len(b)) + for i in range(len(a)): + self.failUnlessEqual(a[i], b[i], msg) + + def failUnlessListAlmostEqual(self, a, b, places = 7, msg = None): + self.failUnlessEqual(len(a), len(b)) + for i in range(len(a)): + self.failUnlessAlmostEqual(a[i], b[i], places, msg) + + def test_binomial_coeff(self): + f = statistics.binomial_coeff + self.failUnlessEqual(f(20, 0), 1) + self.failUnlessEqual(f(20, 1), 20) + self.failUnlessEqual(f(20, 2), 190) + self.failUnlessEqual(f(20, 8), f(20, 12)) + self.should_assert("Should assert if n < k", f, 2, 3) + + def test_binomial_distribution_pmf(self): + f = statistics.binomial_distribution_pmf + + pmf_comp = f(2, .1) + pmf_stat = [0.81, 0.18, 0.01] + self.failUnlessListAlmostEqual(pmf_comp, pmf_stat) + + # Summing across a PMF should give the total probability 1 + self.failUnlessAlmostEqual(sum(pmf_comp), 1) + self.should_assert("Should assert if not 0<=p<=1", f, 1, -1) + self.should_assert("Should assert if n < 1", f, 0, .1) + + def test_survival_pmf(self): + f = statistics.survival_pmf + # Cross-check binomial-distribution method against convolution + # method. + p_list = [.9999] * 100 + [.99] * 50 + [.8] * 20 + pmf1 = statistics.survival_pmf_via_conv(p_list) + pmf2 = statistics.survival_pmf_via_bd(p_list) + self.failUnlessListAlmostEqual(pmf1, pmf2) + self.failUnlessTrue(statistics.valid_pmf(pmf1)) + self.should_assert("Should assert if p_i > 1", f, [1.1]); + self.should_assert("Should assert if p_i < 0", f, [-.1]); + + + def test_convolve(self): + f = statistics.convolve + v1 = [ 1, 2, 3 ] + v2 = [ 4, 5, 6 ] + v3 = [ 7, 8 ] + v1v2result = [ 4, 13, 28, 27, 18 ] + # Convolution is commutative + r1 = f(v1, v2) + r2 = f(v2, v1) + self.failUnlessListEqual(r1, r2, "Convolution should be commutative") + self.failUnlessListEqual(r1, v1v2result, "Didn't match known result") + # Convolution is associative + r1 = f(f(v1, v2), v3) + r2 = f(v1, f(v2, v3)) + self.failUnlessListEqual(r1, r2, "Convolution should be associative") + # Convolution is distributive + r1 = f(v3, [ a + b for a, b in zip(v1, v2) ]) + tmp1 = f(v3, v1) + tmp2 = f(v3, v2) + r2 = [ a + b for a, b in zip(tmp1, tmp2) ] + self.failUnlessListEqual(r1, r2, "Convolution should be distributive") + # Convolution is scalar multiplication associative + tmp1 = f(v1, v2) + r1 = [ a * 4 for a in tmp1 ] + tmp2 = [ a * 4 for a in v1 ] + r2 = f(tmp2, v2) + self.failUnlessListEqual(r1, r2, "Convolution should be scalar multiplication associative") + + def test_find_k(self): + f = statistics.find_k + g = statistics.pr_file_loss + plist = [.9] * 10 + [.8] * 10 + t = .0001 + k = f(plist, t) + self.failUnlessEqual(k, 10) + self.failUnless(g(plist, k) < t) + + def test_pr_file_loss(self): + f = statistics.pr_file_loss + plist = [.5] * 10 + self.failUnlessEqual(f(plist, 3), .0546875) + + def test_pr_backup_file_loss(self): + f = statistics.pr_backup_file_loss + plist = [.5] * 10 + self.failUnlessEqual(f(plist, .5, 3), .02734375) + class Asserts(unittest.TestCase): def should_assert(self, func, *args, **kwargs): diff --git a/src/allmydata/util/mathutil.py b/src/allmydata/util/mathutil.py index 1d83e28c..81b12cfb 100644 --- a/src/allmydata/util/mathutil.py +++ b/src/allmydata/util/mathutil.py @@ -73,3 +73,7 @@ def log_floor(n, b): p *= b k += 1 return k - 1 + +def round_sigfigs(f, n): + fmt = "%." + str(n-1) + "e" + return float(fmt % f) diff --git a/src/allmydata/util/statistics.py b/src/allmydata/util/statistics.py new file mode 100644 index 00000000..5315e95e --- /dev/null +++ b/src/allmydata/util/statistics.py @@ -0,0 +1,225 @@ +# Copyright (c) 2009 Shawn Willden +# mailto:shawn@willden.org + +from __future__ import division +from mathutil import round_sigfigs +import math +import array + +def pr_file_loss(p_list, k): + """ + Probability of single-file loss for shares with reliabilities in + p_list. + + Computes the probability that a single file will become + unrecoverable, based on the individual share survival + probabilities and and k (number of shares needed for recovery). + + Example: pr_file_loss([.9] * 5 + [.99] * 5, 3) returns the + probability that a file with k=3, N=10 and stored on five servers + with reliability .9 and five servers with reliability .99 is lost. + + See survival_pmf docstring for important statistical assumptions. + + """ + assert 0 < k <= len(p_list) + assert valid_probability_list(p_list) + + # Sum elements 0 through k-1 of the share set PMF to get the + # probability that less than k shares survived. + return sum(survival_pmf(p_list)[0:k]) + +def survival_pmf(p_list): + """ + Return the collective PMF of share survival count for a set of + shares with the individual survival probabilities in p_list. + + Example: survival_pmf([.99] * 10 + [.8] * 6) returns the + probability mass function for the number of shares that will + survive from an initial set of 16, 10 with p=0.99 and 6 with + p=0.8. The ith element of the resulting list is the probability + that exactly i shares will survive. + + This calculation makes the following assumptions: + + 1. p_list[i] is the probability that any individual share will + will survive during the time period in question (whatever that may + be). + + 2. The share failures are "independent", in the statistical + sense. Note that if a group of shares are stored on the same + machine or even in the same data center, they are NOT independent + and this calculation is therefore wrong. + """ + assert valid_probability_list(p_list) + + pmf = survival_pmf_via_conv(p_list) + + assert valid_pmf(pmf) + return pmf + +def survival_pmf_via_bd(p_list): + """ + Compute share survival PMF using the binomial distribution PMF as + much as possible. + + This is more efficient than the convolution method below, but + doesn't work for large numbers of shares because the + binomial_coeff calculation blows up. Since the efficiency gains + only matter in the case of large numbers of shares, it's pretty + much useless except for testing the convolution methond. + + Note that this function does little to no error checking and is + intended for internal use and testing only. + """ + pmf_list = [ binomial_distribution_pmf(p_list.count(p), p) + for p in set(p_list) ] + return reduce(convolve, pmf_list) + +def survival_pmf_via_conv(p_list): + """ + Compute share survival PMF using iterated convolution of trivial + PMFs. + + Note that this function does little to no error checking and is + intended for internal use and testing only. + """ + pmf_list = [ [1 - p, p] for p in p_list ]; + return reduce(convolve, pmf_list) + +def print_pmf(pmf, n): + """ + Print a PMF in a readable form, with values rounded to n + significant digits. + """ + for k, p in enumerate(pmf): + print "i=" + str(k) + ":", round_sigfigs(p, n) + +def pr_backup_file_loss(p_list, backup_p, k): + """ + Probability of single-file loss in a backup context + + Same as pr_file_loss, except it factors in the probability of + survival of the original source, specified as backup_p. Because + that's a precondition to caring about the availability of the + backup, it's an independent event. + """ + assert valid_probability_list(p_list) + assert 0 < backup_p <= 1 + assert 0 < k <= len(p_list) + + return pr_file_loss(p_list, k) * (1 - backup_p) + + +def find_k(p_list, target_loss_prob): + """ + Find the highest k value that achieves the targeted loss + probability, given the share reliabilities given in p_list. + """ + assert valid_probability_list(p_list) + assert 0 < target_loss_prob < 1 + + pmf = survival_pmf(p_list) + return find_k_from_pmf(pmf, target_loss_prob) + +def find_k_from_pmf(pmf, target_loss_prob): + """ + Find the highest k value that achieves the targeted loss + probability, given the share survival PMF given in pmf. + """ + assert valid_pmf(pmf) + assert 0 < target_loss_prob < 1 + + loss_prob = 0.0 + for k, p_k in enumerate(pmf): + loss_prob += p_k + if loss_prob > target_loss_prob: + return k + + k = len(pmf) - 1 + return k + +def valid_pmf(pmf): + """ + Validate that pmf looks like a proper discrete probability mass + function in list form. + + Returns true if the elements of pmf sum to 1. + """ + return round(sum(pmf),5) == 1.0 + +def valid_probability_list(p_list): + """ + Validate that p_list is a list of probibilities + """ + for p in p_list: + if p < 0 or p > 1: + return False + + return True + +def convolve(list_a, list_b): + """ + Returns the discrete convolution of two lists. + + Given two random variables X and Y, the convolution of their + probability mass functions Pr(X) and Pr(Y) is equal to the + Pr(X+Y). + """ + n = len(list_a) + m = len(list_b) + + result = [] + for i in range(n + m - 1): + sum = 0.0 + + lower = max(0, i - n + 1) + upper = min(m - 1, i) + + for j in range(lower, upper+1): + sum += list_a[i-j] * list_b[j] + + result.append(sum) + + return result + +def binomial_distribution_pmf(n, p): + """ + Returns Pr(K), where K ~ B(n,p), as a list of values. + + Returns the full probability mass function of a B(n, p) as a list + of values, where the kth element is Pr(K=k), or, in the Tahoe + context, the probability that exactly k copies of a file share + survive, when placed on n independent servers with survival + probability p. + """ + assert p >= 0 and p <= 1, 'p=%s must be in the range [0,1]'%p + assert n > 0 + + result = [] + for k in range(n+1): + result.append(math.pow(p , k ) * + math.pow(1 - p, n - k) * + binomial_coeff(n, k)) + + assert valid_pmf(result) + return result; + +def binomial_coeff(n, k): + """ + Returns the number of ways that k items can be chosen from a set + of n. + """ + assert n >= k + + if k > n: + return 0 + + if k > n/2: + k = n - k + + accum = 1.0 + for i in range(1, k+1): + accum = accum * (n - k + i) // i; + + return int(accum + 0.5)