From: Brian Warner <warner@allmydata.com>
Date: Thu, 6 Sep 2007 01:16:21 +0000 (-0700)
Subject: provisioning.py: add file/server availability numbers
X-Git-Url: https://git.rkrishnan.org/(%5B%5E?a=commitdiff_plain;h=849dbb4950e2d0c09a07ab67d69aaa9659501282;p=tahoe-lafs%2Ftahoe-lafs.git

provisioning.py: add file/server availability numbers
---

diff --git a/src/allmydata/provisioning.py b/src/allmydata/provisioning.py
index fdfb582b..a666dbcf 100644
--- a/src/allmydata/provisioning.py
+++ b/src/allmydata/provisioning.py
@@ -1,10 +1,43 @@
 
 from nevow import inevow, loaders, rend, tags as T
 from twisted.python import util
+import math
 
 def getxmlfile(name):
     return loaders.xmlfile(util.sibpath(__file__, "web/%s" % name))
 
+# factorial and binomial copied from
+# http://mail.python.org/pipermail/python-list/2007-April/435718.html
+
+def factorial(n):
+    """factorial(n): return the factorial of the integer n.
+    factorial(0) = 1
+    factorial(n) with n<0 is -factorial(abs(n))
+    """
+    result = 1
+    for i in xrange(1, abs(n)+1):
+        result *= i
+    if n >= 0:
+        return result
+    else:
+        return -result
+
+def binomial(n, k):
+    if not 0 <= k <= n:
+        return 0
+    if k == 0 or k == n:
+        return 1
+    # calculate n!/k! as one product, avoiding factors that
+    # just get canceled
+    P = k+1
+    for i in xrange(k+2, n+1):
+        P *= i
+    # if you are paranoid:
+    # C, rem = divmod(P, factorial(n-k))
+    # assert rem == 0
+    # return C
+    return P//factorial(n-k)
+
 class ProvisioningTool(rend.Page):
     addSlash = True
     docFactory = getxmlfile("provisioning.xhtml")
@@ -132,8 +165,10 @@ class ProvisioningTool(rend.Page):
                   " no convergence)", i_sharing_ratio)
 
         # Encoding parameters
-        encoding_choices = [("3-of-10", "3-of-10"),
-                            ("25-of-100", "25-of-100"),
+        encoding_choices = [("3-of-10", "3.3x (3-of-10)"),
+                            ("5-of-10", "2x (5-of-10)"),
+                            ("8-of-10", "1.25x (8-of-10)"),
+                            ("25-of-100", "4x (25-of-100)"),
                             ]
         encoding_parameters, i_encoding_parameters = \
                              get_and_set("encoding_parameters",
@@ -161,6 +196,20 @@ class ProvisioningTool(rend.Page):
         add_input("Servers",
                   "How many servers are there?", i_num_servers)
 
+        # availability is measured in dBA = -dBF, where 0dBF is 100% failure,
+        # 10dBF is 10% failure, 20dBF is 1% failure, etc
+        server_dBA_choices = [ (20, "99% [20dBA] (14min/day or 3.5days/year)"),
+                               (30, "99.9% [30dBA] (87sec/day or 9hours/year)"),
+                               (40, "99.99% [40dBA] (60sec/week or 53min/year)"),
+                               (50, "99.999% [50dBA] (5min per year)"),
+                               ]
+        server_dBA, i_server_availability = \
+                    get_and_set("server_availability",
+                                server_dBA_choices,
+                                20, int)
+        add_input("Servers",
+                  "What is the server availability?", i_server_availability)
+
         # deletion/gc/ownership mode
         ownership_choices = [ ("A", "no deletion, no gc, no owners"),
                               ("B", "deletion, no gc, no owners"),
@@ -429,6 +478,19 @@ class ProvisioningTool(rend.Page):
                              (100.0 * share_data_per_server /
                               share_space_per_server)])
 
+            # availability
+            file_dBA = self.file_availability(k, n, server_dBA)
+            user_files_dBA = self.many_files_availability(file_dBA,
+                                                          files_per_user)
+            all_files_dBA = self.many_files_availability(file_dBA, total_files)
+            add_output("Users",
+                       T.div["availability of: ",
+                             "arbitrary file = %d dBA, " % file_dBA,
+                             "all files of user1 = %d dBA, " % user_files_dBA,
+                             "all files in grid = %d dBA" % all_files_dBA,
+                             ],
+                       )
+
 
         all_sections = []
         all_sections.append(build_section("Users"))
@@ -449,3 +511,53 @@ class ProvisioningTool(rend.Page):
               ]
 
         return f
+
+    def file_availability(self, k, n, server_dBA):
+        """
+        The full formula for the availability of a specific file is::
+
+         1 - sum([choose(N,i) * p**i * (1-p)**(N-i)] for i in range(k)])
+
+        Where choose(N,i) = N! / ( i! * (N-i)! ) . Note that each term of
+        this summation is the probability that there are exactly 'i' servers
+        available, and what we're doing is adding up the cases where i is too
+        low.
+
+        This is a nuisance to calculate at all accurately, especially once N
+        gets large, and when p is close to unity. So we make an engineering
+        approximation: if (1-p) is very small, then each [i] term is much
+        larger than the [i-1] term, and the sum is dominated by the i=k-1
+        term. This only works for (1-p) < 10%, and when the choose() function
+        doesn't rise fast enough to compensate. For high-expansion encodings
+        (3-of-10, 25-of-100), the choose() function is rising at the same
+        time as the (1-p)**(N-i) term, so that's not an issue. For
+        low-expansion encodings (7-of-10, 75-of-100) the two values are
+        moving in opposite directions, so more care must be taken.
+
+        Note that the p**i term has only a minor effect as long as (1-p)*N is
+        small, and even then the effect is attenuated by the 1-p term.
+        """
+
+        assert server_dBA > 9  # >=90% availability to use the approximation
+        factor = binomial(n, k-1)
+        factor_dBA = 10 * math.log10(factor)
+        exponent = n - k + 1
+        file_dBA = server_dBA * exponent - factor_dBA
+        return file_dBA
+
+    def many_files_availability(self, file_dBA, num_files):
+        """The probability that 'num_files' independent bernoulli trials will
+        succeed (i.e. we can recover all files in the grid at any given
+        moment) is p**num_files . Since p is close to unity, we express in p
+        in dBA instead, so we can get useful precision on q (=1-p), and then
+        the formula becomes::
+
+         P_some_files_unavailable = 1 - (1 - q)**num_files
+
+        That (1-q)**n expands with the usual binomial sequence, 1 - nq +
+        Xq**2 ... + Xq**n . We use the same approximation as before, since we
+        know q is close to zero, and we get to ignore all the terms past -nq.
+        """
+
+        many_files_dBA = file_dBA - 10 * math.log10(num_files)
+        return many_files_dBA