git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/commitdiff
remove 'provisioning'/'reliability' from WUI, add to misc/operations_helpers
author    Brian Warner <warner@lothar.com>
          Wed, 15 Feb 2012 18:18:53 +0000
committer Brian Warner <warner@lothar.com>
          Thu, 16 Feb 2012 22:29:05 +0000
Also remove docs related to reliability/provisioning pages

17 files changed:
docs/architecture.rst
docs/frontends/webapi.rst
misc/operations_helpers/provisioning/provisioning.py [new file with mode: 0644]
misc/operations_helpers/provisioning/provisioning.xhtml [new file with mode: 0644]
misc/operations_helpers/provisioning/reliability.py [new file with mode: 0644]
misc/operations_helpers/provisioning/reliability.xhtml [new file with mode: 0644]
misc/operations_helpers/provisioning/test_provisioning.py [new file with mode: 0644]
misc/operations_helpers/provisioning/web_reliability.py [new file with mode: 0644]
src/allmydata/provisioning.py [deleted file]
src/allmydata/reliability.py [deleted file]
src/allmydata/test/test_provisioning.py [deleted file]
src/allmydata/test/test_web.py
src/allmydata/web/provisioning.xhtml [deleted file]
src/allmydata/web/reliability.py [deleted file]
src/allmydata/web/reliability.xhtml [deleted file]
src/allmydata/web/root.py
src/allmydata/web/welcome.xhtml

diff --git a/docs/architecture.rst b/docs/architecture.rst
index 3a9e08fbc0cc3eb92ed680019c3dc15ffc775e8a..362a179208ac229faf80dc9f65d22820d0e9d79e 100644 (file)
@@ -553,9 +553,3 @@ will be able to reduce the expansion factor down to a bare minimum while
 still retaining high reliability, but large unstable grids (where nodes are
 coming and going very quickly) may require more repair/verification bandwidth
 than actual upload/download traffic.
-
-Tahoe-LAFS nodes that run a webserver have a page dedicated to provisioning
-decisions: this tool may help you evaluate different expansion factors and
-view the disk consumption of each. It is also acquiring some sections with
-availability/reliability numbers, as well as preliminary cost analysis data.
-This tool will continue to evolve as our analysis improves.
diff --git a/docs/frontends/webapi.rst b/docs/frontends/webapi.rst
index 47ab75429ecccdf642c55e9d720c363043a3a9d1..b67ee33450935e95b8a33b1d99f57aa510fe0a3e 100644 (file)
@@ -1805,17 +1805,6 @@ This is the "Welcome Page", and contains a few distinct sections::
  implementation hashes synchronously, so clients will probably never see
  progress-hash!=1.0).
 
-``GET /provisioning/``
-
- This page provides a basic tool to predict the likely storage and bandwidth
- requirements of a large Tahoe grid. It provides forms to input things like
- total number of users, number of files per user, average file size, number
- of servers, expansion ratio, hard drive failure rate, etc. It then provides
- numbers like how many disks per server will be needed, how many read
- operations per second should be expected, and the likely MTBF for files in
- the grid. This information is very preliminary, and the model upon which it
- is based still needs a lot of work.
-
 ``GET /helper_status/``
 
  If the node is running a helper (i.e. if [helper]enabled is set to True in
diff --git a/misc/operations_helpers/provisioning/provisioning.py b/misc/operations_helpers/provisioning/provisioning.py
new file mode 100644 (file)
index 0000000..9d9af0e
--- /dev/null
@@ -0,0 +1,772 @@
+
+from nevow import inevow, rend, tags as T
+import math
+from allmydata.util import mathutil
+from allmydata.web.common import getxmlfile
+
+# factorial and binomial copied from
+# http://mail.python.org/pipermail/python-list/2007-April/435718.html
+
+def factorial(n):
+    """factorial(n): return the factorial of the non-negative integer n.
+    factorial(0) = 1
+    """
+    assert n >= 0
+    result = 1
+    for i in xrange(1, n+1):
+        result *= i
+    return result
+
+def binomial(n, k):
+    assert 0 <= k <= n
+    if k == 0 or k == n:
+        return 1
+    # calculate n!/k! as one product, avoiding factors that
+    # just get canceled
+    P = k+1
+    for i in xrange(k+2, n+1):
+        P *= i
+    # if you are paranoid:
+    # C, rem = divmod(P, factorial(n-k))
+    # assert rem == 0
+    # return C
+    return P//factorial(n-k)
+
+class ProvisioningTool(rend.Page):
+    addSlash = True
+    docFactory = getxmlfile("provisioning.xhtml")
+
+    def render_forms(self, ctx, data):
+        req = inevow.IRequest(ctx)
+
+        def getarg(name, astype=int):
+            if req.method != "POST":
+                return None
+            if name in req.fields:
+                return astype(req.fields[name].value)
+            return None
+        return self.do_forms(getarg)
+
+
+    def do_forms(self, getarg):
+        filled = getarg("filled", bool)
+
+        def get_and_set(name, options, default=None, astype=int):
+            current_value = getarg(name, astype)
+            i_select = T.select(name=name)
+            for (count, description) in options:
+                count = astype(count)
+                if ((current_value is not None and count == current_value) or
+                    (current_value is None and count == default)):
+                    o = T.option(value=str(count), selected="true")[description]
+                else:
+                    o = T.option(value=str(count))[description]
+                i_select = i_select[o]
+            if current_value is None:
+                current_value = default
+            return current_value, i_select
+
+        sections = {}
+        def add_input(section, text, entry):
+            if section not in sections:
+                sections[section] = []
+            sections[section].extend([T.div[text, ": ", entry], "\n"])
+
+        def add_output(section, entry):
+            if section not in sections:
+                sections[section] = []
+            sections[section].extend([entry, "\n"])
+
+        def build_section(section):
+            return T.fieldset[T.legend[section], sections[section]]
+
+        def number(value, suffix=""):
+            scaling = 1
+            if value < 1:
+                fmt = "%1.2g%s"
+            elif value < 100:
+                fmt = "%.1f%s"
+            elif value < 1000:
+                fmt = "%d%s"
+            elif value < 1e6:
+                fmt = "%.2fk%s"; scaling = 1e3
+            elif value < 1e9:
+                fmt = "%.2fM%s"; scaling = 1e6
+            elif value < 1e12:
+                fmt = "%.2fG%s"; scaling = 1e9
+            elif value < 1e15:
+                fmt = "%.2fT%s"; scaling = 1e12
+            elif value < 1e18:
+                fmt = "%.2fP%s"; scaling = 1e15
+            else:
+                fmt = "huge! %g%s"
+            return fmt % (value / scaling, suffix)
+
+        user_counts = [(5, "5 users"),
+                       (50, "50 users"),
+                       (200, "200 users"),
+                       (1000, "1k users"),
+                       (10000, "10k users"),
+                       (50000, "50k users"),
+                       (100000, "100k users"),
+                       (500000, "500k users"),
+                       (1000000, "1M users"),
+                       ]
+        num_users, i_num_users = get_and_set("num_users", user_counts, 50000)
+        add_input("Users",
+                  "How many users are on this network?", i_num_users)
+
+        files_per_user_counts = [(100, "100 files"),
+                                 (1000, "1k files"),
+                                 (10000, "10k files"),
+                                 (100000, "100k files"),
+                                 (1e6, "1M files"),
+                                 ]
+        files_per_user, i_files_per_user = get_and_set("files_per_user",
+                                                       files_per_user_counts,
+                                                       1000)
+        add_input("Users",
+                  "How many files for each user? (avg)",
+                  i_files_per_user)
+
+        space_per_user_sizes = [(1e6, "1MB"),
+                                (10e6, "10MB"),
+                                (100e6, "100MB"),
+                                (200e6, "200MB"),
+                                (1e9, "1GB"),
+                                (2e9, "2GB"),
+                                (5e9, "5GB"),
+                                (10e9, "10GB"),
+                                (100e9, "100GB"),
+                                (1e12, "1TB"),
+                                (2e12, "2TB"),
+                                (5e12, "5TB"),
+                                ]
+        # Estimate ~5gb per user as a more realistic case
+        space_per_user, i_space_per_user = get_and_set("space_per_user",
+                                                       space_per_user_sizes,
+                                                       5e9)
+        add_input("Users",
+                  "How much data for each user? (avg)",
+                  i_space_per_user)
+
+        sharing_ratios = [(1.0, "1.0x"),
+                          (1.1, "1.1x"),
+                          (2.0, "2.0x"),
+                          ]
+        sharing_ratio, i_sharing_ratio = get_and_set("sharing_ratio",
+                                                     sharing_ratios, 1.0,
+                                                     float)
+        add_input("Users",
+                  "What is the sharing ratio? (1.0x is no-sharing and"
+                  " no convergence)", i_sharing_ratio)
+
+        # Encoding parameters
+        encoding_choices = [("3-of-10-5", "3.3x (3-of-10, repair below 5)"),
+                            ("3-of-10-8", "3.3x (3-of-10, repair below 8)"),
+                            ("5-of-10-7", "2x (5-of-10, repair below 7)"),
+                            ("8-of-10-9", "1.25x (8-of-10, repair below 9)"),
+                            ("27-of-30-28", "1.1x (27-of-30, repair below 28"),
+                            ("25-of-100-50", "4x (25-of-100, repair below 50)"),
+                            ]
+        encoding_parameters, i_encoding_parameters = \
+                             get_and_set("encoding_parameters",
+                                         encoding_choices, "3-of-10-5", str)
+        encoding_pieces = encoding_parameters.split("-")
+        k = int(encoding_pieces[0])
+        assert encoding_pieces[1] == "of"
+        n = int(encoding_pieces[2])
+        # we repair the file when the number of available shares drops below
+        # this value
+        repair_threshold = int(encoding_pieces[3])
+
+        add_input("Servers",
+                  "What are the default encoding parameters?",
+                  i_encoding_parameters)
+
+        # Server info
+        num_server_choices = [ (5, "5 servers"),
+                               (10, "10 servers"),
+                               (15, "15 servers"),
+                               (30, "30 servers"),
+                               (50, "50 servers"),
+                               (100, "100 servers"),
+                               (200, "200 servers"),
+                               (300, "300 servers"),
+                               (500, "500 servers"),
+                               (1000, "1k servers"),
+                               (2000, "2k servers"),
+                               (5000, "5k servers"),
+                               (10e3, "10k servers"),
+                               (100e3, "100k servers"),
+                               (1e6, "1M servers"),
+                               ]
+        num_servers, i_num_servers = \
+                     get_and_set("num_servers", num_server_choices, 30, int)
+        add_input("Servers",
+                  "How many servers are there?", i_num_servers)
+
+        # availability is measured in dBA = -dBF, where 0dBF is 100% failure,
+        # 10dBF is 10% failure, 20dBF is 1% failure, etc
+        server_dBA_choices = [ (10, "90% [10dBA] (2.4hr/day)"),
+                               (13, "95% [13dBA] (1.2hr/day)"),
+                               (20, "99% [20dBA] (14min/day or 3.5days/year)"),
+                               (23, "99.5% [23dBA] (7min/day or 1.75days/year)"),
+                               (30, "99.9% [30dBA] (87sec/day or 9hours/year)"),
+                               (40, "99.99% [40dBA] (60sec/week or 53min/year)"),
+                               (50, "99.999% [50dBA] (5min per year)"),
+                               ]
+        server_dBA, i_server_availability = \
+                    get_and_set("server_availability",
+                                server_dBA_choices,
+                                20, int)
+        add_input("Servers",
+                  "What is the server availability?", i_server_availability)
+
+        drive_MTBF_choices = [ (40, "40,000 Hours"),
+                               ]
+        drive_MTBF, i_drive_MTBF = \
+                    get_and_set("drive_MTBF", drive_MTBF_choices, 40, int)
+        add_input("Drives",
+                  "What is the hard drive MTBF?", i_drive_MTBF)
+        # http://www.tgdaily.com/content/view/30990/113/
+        # http://labs.google.com/papers/disk_failures.pdf
+        # google sees:
+        #  1.7% of the drives they replaced were 0-1 years old
+        #  8% of the drives they replaced were 1-2 years old
+        #  8.6% were 2-3 years old
+        #  6% were 3-4 years old, about 8% were 4-5 years old
+
+        drive_size_choices = [ (100, "100 GB"),
+                               (250, "250 GB"),
+                               (500, "500 GB"),
+                               (750, "750 GB"),
+                               (1000, "1000 GB"),
+                               (2000, "2000 GB"),
+                               (3000, "3000 GB"),
+                               ]
+        drive_size, i_drive_size = \
+                    get_and_set("drive_size", drive_size_choices, 3000, int)
+        drive_size = drive_size * 1e9
+        add_input("Drives",
+                  "What is the capacity of each hard drive?", i_drive_size)
+        drive_failure_model_choices = [ ("E", "Exponential"),
+                                        ("U", "Uniform"),
+                                        ]
+        drive_failure_model, i_drive_failure_model = \
+                             get_and_set("drive_failure_model",
+                                         drive_failure_model_choices,
+                                         "E", str)
+        add_input("Drives",
+                  "How should we model drive failures?", i_drive_failure_model)
+
+        # drive_failure_rate is in failures per second
+        if drive_failure_model == "E":
+            drive_failure_rate = 1.0 / (drive_MTBF * 1000 * 3600)
+        else:
+            drive_failure_rate = 0.5 / (drive_MTBF * 1000 * 3600)
+
+        # deletion/gc/ownership mode
+        ownership_choices = [ ("A", "no deletion, no gc, no owners"),
+                              ("B", "deletion, no gc, no owners"),
+                              ("C", "deletion, share timers, no owners"),
+                              ("D", "deletion, no gc, yes owners"),
+                              ("E", "deletion, owner timers"),
+                              ]
+        ownership_mode, i_ownership_mode = \
+                        get_and_set("ownership_mode", ownership_choices,
+                                    "A", str)
+        add_input("Servers",
+                  "What is the ownership mode?", i_ownership_mode)
+
+        # client access behavior
+        access_rates = [ (1, "one file per day"),
+                         (10, "10 files per day"),
+                         (100, "100 files per day"),
+                         (1000, "1k files per day"),
+                         (10e3, "10k files per day"),
+                         (100e3, "100k files per day"),
+                         ]
+        download_files_per_day, i_download_rate = \
+                                get_and_set("download_rate", access_rates,
+                                            100, int)
+        add_input("Users",
+                  "How many files are downloaded per day?", i_download_rate)
+        download_rate = 1.0 * download_files_per_day / (24*60*60)
+
+        upload_files_per_day, i_upload_rate = \
+                              get_and_set("upload_rate", access_rates,
+                                          10, int)
+        add_input("Users",
+                  "How many files are uploaded per day?", i_upload_rate)
+        upload_rate = 1.0 * upload_files_per_day / (24*60*60)
+
+        delete_files_per_day, i_delete_rate = \
+                              get_and_set("delete_rate", access_rates,
+                                          10, int)
+        add_input("Users",
+                  "How many files are deleted per day?", i_delete_rate)
+        delete_rate = 1.0 * delete_files_per_day / (24*60*60)
+
+
+        # the value is in days
+        lease_timers = [ (1, "one refresh per day"),
+                         (7, "one refresh per week"),
+                         ]
+        lease_timer, i_lease = \
+                     get_and_set("lease_timer", lease_timers,
+                                 7, int)
+        add_input("Users",
+                  "How frequently do clients refresh files or accounts? "
+                  "(if necessary)",
+                  i_lease)
+        seconds_per_lease = 24*60*60*lease_timer
+
+        check_timer_choices = [ (1, "every week"),
+                                (4, "every month"),
+                                (8, "every two months"),
+                                (16, "every four months"),
+                                ]
+        check_timer, i_check_timer = \
+                     get_and_set("check_timer", check_timer_choices, 4, int)
+        add_input("Users",
+                  "How frequently should we check on each file?",
+                  i_check_timer)
+        file_check_interval = check_timer * 7 * 24 * 3600
+
+
+        if filled:
+            add_output("Users", T.div["Total users: %s" % number(num_users)])
+            add_output("Users",
+                       T.div["Files per user: %s" % number(files_per_user)])
+            file_size = 1.0 * space_per_user / files_per_user
+            add_output("Users",
+                       T.div["Average file size: ", number(file_size)])
+            total_files = num_users * files_per_user / sharing_ratio
+
+            add_output("Grid",
+                       T.div["Total number of files in grid: ",
+                             number(total_files)])
+            total_space = num_users * space_per_user / sharing_ratio
+            add_output("Grid",
+                       T.div["Total volume of plaintext in grid: ",
+                             number(total_space, "B")])
+
+            total_shares = n * total_files
+            add_output("Grid",
+                       T.div["Total shares in grid: ", number(total_shares)])
+            expansion = float(n) / float(k)
+
+            total_usage = expansion * total_space
+            add_output("Grid",
+                       T.div["Share data in grid: ", number(total_usage, "B")])
+
+            if n > num_servers:
+                # silly configuration, causes Tahoe2 to wrap and put multiple
+                # shares on some servers.
+                add_output("Servers",
+                           T.div["non-ideal: more shares than servers"
+                                 " (n=%d, servers=%d)" % (n, num_servers)])
+                # every file has at least one share on every server
+                buckets_per_server = total_files
+                shares_per_server = total_files * ((1.0 * n) / num_servers)
+            else:
+                # if nobody is full, then no lease requests will be turned
+                # down for lack of space, and no two shares for the same file
+                # will share a server. Therefore the chance that any given
+                # file has a share on any given server is n/num_servers.
+                buckets_per_server = total_files * ((1.0 * n) / num_servers)
+                # since each such represented file only puts one share on a
+                # server, the total number of shares per server is the same.
+                shares_per_server = buckets_per_server
+            add_output("Servers",
+                       T.div["Buckets per server: ",
+                             number(buckets_per_server)])
+            add_output("Servers",
+                       T.div["Shares per server: ",
+                             number(shares_per_server)])
+
+            # how much space is used on the storage servers for the shares?
+            #  the share data itself
+            share_data_per_server = total_usage / num_servers
+            add_output("Servers",
+                       T.div["Share data per server: ",
+                             number(share_data_per_server, "B")])
+            # this is determined empirically. H=hashsize=32, for a one-segment
+            # file and 3-of-10 encoding
+            share_validation_per_server = 266 * shares_per_server
+            # this could be 423*buckets_per_server, if we moved the URI
+            # extension into a separate file, but that would actually consume
+            # *more* space (minimum filesize is 4KiB), unless we moved all
+            # shares for a given bucket into a single file.
+            share_uri_extension_per_server = 423 * shares_per_server
+
+            # ownership mode adds per-bucket data
+            H = 32 # depends upon the desired security of delete/refresh caps
+            # bucket_lease_size is the amount of data needed to keep track of
+            # the delete/refresh caps for each bucket.
+            bucket_lease_size = 0
+            client_bucket_refresh_rate = 0
+            owner_table_size = 0
+            if ownership_mode in ("B", "C", "D", "E"):
+                bucket_lease_size = sharing_ratio * 1.0 * H
+            if ownership_mode in ("B", "C"):
+                # refreshes per second per client
+                client_bucket_refresh_rate = (1.0 * n * files_per_user /
+                                              seconds_per_lease)
+                add_output("Users",
+                           T.div["Client share refresh rate (outbound): ",
+                                 number(client_bucket_refresh_rate, "Hz")])
+                server_bucket_refresh_rate = (client_bucket_refresh_rate *
+                                              num_users / num_servers)
+                add_output("Servers",
+                           T.div["Server share refresh rate (inbound): ",
+                                 number(server_bucket_refresh_rate, "Hz")])
+            if ownership_mode in ("D", "E"):
+                # each server must maintain a bidirectional mapping from
+                # buckets to owners. One way to implement this would be to
+                # put a list of four-byte owner numbers into each bucket, and
+                # a list of four-byte share numbers into each owner (although
+                # of course we'd really just throw it into a database and let
+                # the experts take care of the details).
+                owner_table_size = 2*(buckets_per_server * sharing_ratio * 4)
+
+            if ownership_mode in ("E",):
+                # in this mode, clients must refresh one timer per server
+                client_account_refresh_rate = (1.0 * num_servers /
+                                               seconds_per_lease)
+                add_output("Users",
+                           T.div["Client account refresh rate (outbound): ",
+                                 number(client_account_refresh_rate, "Hz")])
+                server_account_refresh_rate = (client_account_refresh_rate *
+                                              num_users / num_servers)
+                add_output("Servers",
+                           T.div["Server account refresh rate (inbound): ",
+                                 number(server_account_refresh_rate, "Hz")])
+
+            # TODO: buckets vs shares here is a bit wonky, but in
+            # non-wrapping grids it shouldn't matter
+            share_lease_per_server = bucket_lease_size * buckets_per_server
+            share_ownertable_per_server = owner_table_size
+
+            share_space_per_server = (share_data_per_server +
+                                      share_validation_per_server +
+                                      share_uri_extension_per_server +
+                                      share_lease_per_server +
+                                      share_ownertable_per_server)
+            add_output("Servers",
+                       T.div["Share space per server: ",
+                             number(share_space_per_server, "B"),
+                             " (data ",
+                             number(share_data_per_server, "B"),
+                             ", validation ",
+                             number(share_validation_per_server, "B"),
+                             ", UEB ",
+                             number(share_uri_extension_per_server, "B"),
+                             ", lease ",
+                             number(share_lease_per_server, "B"),
+                             ", ownertable ",
+                             number(share_ownertable_per_server, "B"),
+                             ")",
+                             ])
+
+
+            # rates
+            client_download_share_rate = download_rate * k
+            client_download_byte_rate = download_rate * file_size
+            add_output("Users",
+                       T.div["download rate: shares = ",
+                             number(client_download_share_rate, "Hz"),
+                             " , bytes = ",
+                             number(client_download_byte_rate, "Bps"),
+                             ])
+            total_file_check_rate = 1.0 * total_files / file_check_interval
+            client_check_share_rate = total_file_check_rate / num_users
+            add_output("Users",
+                       T.div["file check rate: shares = ",
+                             number(client_check_share_rate, "Hz"),
+                             " (interval = %s)" %
+                             number(1 / client_check_share_rate, "s"),
+                             ])
+
+            client_upload_share_rate = upload_rate * n
+            # TODO: doesn't include overhead
+            client_upload_byte_rate = upload_rate * file_size * expansion
+            add_output("Users",
+                       T.div["upload rate: shares = ",
+                             number(client_upload_share_rate, "Hz"),
+                             " , bytes = ",
+                             number(client_upload_byte_rate, "Bps"),
+                             ])
+            client_delete_share_rate = delete_rate * n
+
+            server_inbound_share_rate = (client_upload_share_rate *
+                                         num_users / num_servers)
+            server_inbound_byte_rate = (client_upload_byte_rate *
+                                        num_users / num_servers)
+            add_output("Servers",
+                       T.div["upload rate (inbound): shares = ",
+                             number(server_inbound_share_rate, "Hz"),
+                             " , bytes = ",
+                              number(server_inbound_byte_rate, "Bps"),
+                             ])
+            add_output("Servers",
+                       T.div["share check rate (inbound): ",
+                             number(total_file_check_rate * n / num_servers,
+                                    "Hz"),
+                             ])
+
+            server_share_modify_rate = ((client_upload_share_rate +
+                                         client_delete_share_rate) *
+                                         num_users / num_servers)
+            add_output("Servers",
+                       T.div["share modify rate: shares = ",
+                             number(server_share_modify_rate, "Hz"),
+                             ])
+
+            server_outbound_share_rate = (client_download_share_rate *
+                                          num_users / num_servers)
+            server_outbound_byte_rate = (client_download_byte_rate *
+                                         num_users / num_servers)
+            add_output("Servers",
+                       T.div["download rate (outbound): shares = ",
+                             number(server_outbound_share_rate, "Hz"),
+                             " , bytes = ",
+                              number(server_outbound_byte_rate, "Bps"),
+                             ])
+
+
+            total_share_space = num_servers * share_space_per_server
+            add_output("Grid",
+                       T.div["Share space consumed: ",
+                             number(total_share_space, "B")])
+            add_output("Grid",
+                       T.div[" %% validation: %.2f%%" %
+                             (100.0 * share_validation_per_server /
+                              share_space_per_server)])
+            add_output("Grid",
+                       T.div[" %% uri-extension: %.2f%%" %
+                             (100.0 * share_uri_extension_per_server /
+                              share_space_per_server)])
+            add_output("Grid",
+                       T.div[" %% lease data: %.2f%%" %
+                             (100.0 * share_lease_per_server /
+                              share_space_per_server)])
+            add_output("Grid",
+                       T.div[" %% owner data: %.2f%%" %
+                             (100.0 * share_ownertable_per_server /
+                              share_space_per_server)])
+            add_output("Grid",
+                       T.div[" %% share data: %.2f%%" %
+                             (100.0 * share_data_per_server /
+                              share_space_per_server)])
+            add_output("Grid",
+                       T.div["file check rate: ",
+                             number(total_file_check_rate,
+                                    "Hz")])
+
+            total_drives = max(mathutil.div_ceil(int(total_share_space),
+                                                 int(drive_size)),
+                               num_servers)
+            add_output("Drives",
+                       T.div["Total drives: ", number(total_drives), " drives"])
+            drives_per_server = mathutil.div_ceil(total_drives, num_servers)
+            add_output("Servers",
+                       T.div["Drives per server: ", drives_per_server])
+
+            # costs
+            if drive_size == 3000 * 1e9:
+                add_output("Servers", T.div["3000GB drive: $250 each"])
+                drive_cost = 250
+            else:
+                add_output("Servers",
+                           T.div[T.b["unknown cost per drive, assuming $100"]])
+                drive_cost = 100
+
+            if drives_per_server <= 4:
+                add_output("Servers", T.div["1U box with <= 4 drives: $1500"])
+                server_cost = 1500 # typical 1U box
+            elif drives_per_server <= 12:
+                add_output("Servers", T.div["2U box with <= 12 drives: $2500"])
+                server_cost = 2500 # 2U box
+            else:
+                add_output("Servers",
+                           T.div[T.b["Note: too many drives per server, "
+                                     "assuming $3000"]])
+                server_cost = 3000
+
+            server_capital_cost = (server_cost + drives_per_server * drive_cost)
+            total_server_cost = float(num_servers * server_capital_cost)
+            add_output("Servers", T.div["Capital cost per server: $",
+                                        server_capital_cost])
+            add_output("Grid", T.div["Capital cost for all servers: $",
+                                     number(total_server_cost)])
+            # $70/Mbps/mo
+            # $44/server/mo power+space
+            server_bandwidth = max(server_inbound_byte_rate,
+                                   server_outbound_byte_rate)
+            server_bandwidth_mbps = mathutil.div_ceil(int(server_bandwidth*8),
+                                                      int(1e6))
+            server_monthly_cost = 70*server_bandwidth_mbps + 44
+            add_output("Servers", T.div["Monthly cost per server: $",
+                                        server_monthly_cost])
+            add_output("Users", T.div["Capital cost per user: $",
+                                      number(total_server_cost / num_users)])
+
+            # reliability
+            any_drive_failure_rate = total_drives * drive_failure_rate
+            any_drive_MTBF = 1 // any_drive_failure_rate  # in seconds
+            any_drive_MTBF_days = any_drive_MTBF / 86400
+            add_output("Drives",
+                       T.div["MTBF (any drive): ",
+                             number(any_drive_MTBF_days), " days"])
+            drive_replacement_monthly_cost = (float(drive_cost)
+                                              * any_drive_failure_rate
+                                              *30*86400)
+            add_output("Grid",
+                       T.div["Monthly cost of replacing drives: $",
+                             number(drive_replacement_monthly_cost)])
+
+            total_server_monthly_cost = float(num_servers * server_monthly_cost
+                                              + drive_replacement_monthly_cost)
+
+            add_output("Grid", T.div["Monthly cost for all servers: $",
+                                     number(total_server_monthly_cost)])
+            add_output("Users",
+                       T.div["Monthly cost per user: $",
+                             number(total_server_monthly_cost / num_users)])
+
+            # availability
+            file_dBA = self.file_availability(k, n, server_dBA)
+            user_files_dBA = self.many_files_availability(file_dBA,
+                                                          files_per_user)
+            all_files_dBA = self.many_files_availability(file_dBA, total_files)
+            add_output("Users",
+                       T.div["availability of: ",
+                             "arbitrary file = %d dBA, " % file_dBA,
+                             "all files of user1 = %d dBA, " % user_files_dBA,
+                             "all files in grid = %d dBA" % all_files_dBA,
+                             ],
+                       )
+
+            time_until_files_lost = (n-k+1) / any_drive_failure_rate
+            add_output("Grid",
+                       T.div["avg time until files are lost: ",
+                             number(time_until_files_lost, "s"), ", ",
+                             number(time_until_files_lost/86400, " days"),
+                             ])
+
+            share_data_loss_rate = any_drive_failure_rate * drive_size
+            add_output("Grid",
+                       T.div["share data loss rate: ",
+                             number(share_data_loss_rate,"Bps")])
+
+            # the worst-case survival numbers occur when we do a file check
+            # and the file is just above the threshold for repair (so we
+            # decide to not repair it). The question is then: what is the
+            # chance that the file will decay so badly before the next check
+            # that we can't recover it? The resulting probability is per
+            # check interval.
+            # Note that the chances of us getting into this situation are low.
+            P_disk_failure_during_interval = (drive_failure_rate *
+                                              file_check_interval)
+            disk_failure_dBF = 10*math.log10(P_disk_failure_during_interval)
+            disk_failure_dBA = -disk_failure_dBF
+            file_survives_dBA = self.file_availability(k, repair_threshold,
+                                                       disk_failure_dBA)
+            user_files_survives_dBA = self.many_files_availability( \
+                file_survives_dBA, files_per_user)
+            all_files_survives_dBA = self.many_files_availability( \
+                file_survives_dBA, total_files)
+            add_output("Users",
+                       T.div["survival of: ",
+                             "arbitrary file = %d dBA, " % file_survives_dBA,
+                             "all files of user1 = %d dBA, " %
+                             user_files_survives_dBA,
+                             "all files in grid = %d dBA" %
+                             all_files_survives_dBA,
+                             " (per worst-case check interval)",
+                             ])
+
+
+
+        all_sections = []
+        all_sections.append(build_section("Users"))
+        all_sections.append(build_section("Servers"))
+        all_sections.append(build_section("Drives"))
+        if "Grid" in sections:
+            all_sections.append(build_section("Grid"))
+
+        f = T.form(action=".", method="post", enctype="multipart/form-data")
+
+        if filled:
+            action = "Recompute"
+        else:
+            action = "Compute"
+
+        f = f[T.input(type="hidden", name="filled", value="true"),
+              T.input(type="submit", value=action),
+              all_sections,
+              ]
+
+        try:
+            from allmydata import reliability
+            # we import this just to test to see if the page is available
+            _hush_pyflakes = reliability
+            del _hush_pyflakes
+            f = [T.div[T.a(href="../reliability")["Reliability Math"]], f]
+        except ImportError:
+            pass
+
+        return f
+
+    def file_availability(self, k, n, server_dBA):
+        """
+        The full formula for the availability of a specific file is::
+
+         1 - sum([choose(N,i) * p**i * (1-p)**(N-i)] for i in range(k)])
+
+        Where choose(N,i) = N! / ( i! * (N-i)! ) . Note that each term of
+        this summation is the probability that there are exactly 'i' servers
+        available, and what we're doing is adding up the cases where i is too
+        low.
+
+        This is a nuisance to calculate at all accurately, especially once N
+        gets large, and when p is close to unity. So we make an engineering
+        approximation: if (1-p) is very small, then each [i] term is much
+        larger than the [i-1] term, and the sum is dominated by the i=k-1
+        term. This only works for (1-p) < 10%, and when the choose() function
+        doesn't rise fast enough to compensate. For high-expansion encodings
+        (3-of-10, 25-of-100), the choose() function is rising at the same
+        time as the (1-p)**(N-i) term, so that's not an issue. For
+        low-expansion encodings (7-of-10, 75-of-100) the two values are
+        moving in opposite directions, so more care must be taken.
+
+        Note that the p**i term has only a minor effect as long as (1-p)*N is
+        small, and even then the effect is attenuated by the 1-p term.
+        """
+
+        assert server_dBA > 9  # >=90% availability to use the approximation
+        factor = binomial(n, k-1)
+        factor_dBA = 10 * math.log10(factor)
+        exponent = n - k + 1
+        file_dBA = server_dBA * exponent - factor_dBA
+        return file_dBA
+
+    def many_files_availability(self, file_dBA, num_files):
+        """The probability that 'num_files' independent bernoulli trials will
+        succeed (i.e. we can recover all files in the grid at any given
+        moment) is p**num_files . Since p is close to unity, we express in p
+        in dBA instead, so we can get useful precision on q (=1-p), and then
+        the formula becomes::
+
+         P_some_files_unavailable = 1 - (1 - q)**num_files
+
+        That (1-q)**n expands with the usual binomial sequence, 1 - nq +
+        Xq**2 ... + Xq**n . We use the same approximation as before, since we
+        know q is close to zero, and we get to ignore all the terms past -nq.
+        """
+
+        many_files_dBA = file_dBA - 10 * math.log10(num_files)
+        return many_files_dBA
diff --git a/misc/operations_helpers/provisioning/provisioning.xhtml b/misc/operations_helpers/provisioning/provisioning.xhtml
new file mode 100644 (file)
index 0000000..bfa4edb
--- /dev/null
@@ -0,0 +1,18 @@
+<html xmlns:n="http://nevow.com/ns/nevow/0.1">
+  <head>
+    <title>Tahoe-LAFS - Provisioning Tool</title>
+    <link href="/tahoe.css" rel="stylesheet" type="text/css"/>
+    <link href="/icon.png" rel="shortcut icon" />
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+  </head>
+  <body>
+
+<h1>Tahoe-LAFS Provisioning Tool</h1>
+
+<p>This page will help you determine how much disk space and network
+bandwidth will be required by various sizes and types of Tahoe-LAFS networks.</p>
+
+<div n:render="forms" />
+
+  </body>
+</html>
diff --git a/misc/operations_helpers/provisioning/reliability.py b/misc/operations_helpers/provisioning/reliability.py
new file mode 100644 (file)
index 0000000..a0d6076
--- /dev/null
@@ -0,0 +1,251 @@
+#! /usr/bin/python
+
+import math
+from allmydata.util import statistics
+from numpy import array, matrix, dot
+
+DAY=24*60*60
+MONTH=31*DAY
+YEAR=365*DAY
+
+class ReliabilityModel:
+    """Generate a model of system-wide reliability, given several input
+    parameters.
+
+    This runs a simulation in which time is quantized down to 'delta' seconds
+    (default is one month): a smaller delta will result in a more accurate
+    simulation, but will take longer to run. 'report_span' simulated seconds
+    will be run.
+
+    The encoding parameters are provided as 'k' (minimum number of shares
+    needed to recover the file) and 'N' (total number of shares generated).
+    The default parameters are 3-of-10.
+
+    The first step is to build a probability of individual drive loss during
+    any given delta. This uses a simple exponential model, in which the
+    average drive lifetime is specified by the 'drive_lifetime' parameter
+    (default is 8 years).
+
+    The second step is to calculate a 'transition matrix': a table of
+    probabilities that shows, given A shares at the start of the delta, what
+    the chances are of having B shares left at the end of the delta. The
+    current code optimistically assumes all drives are independent. A
+    subclass could override that assumption.
+
+    An additional 'repair matrix' is created to show what happens when the
+    Checker/Repairer is run. In the simulation, the Checker will be run every
+    'check_period' seconds (default is one month), and the Repairer will be
+    run if it sees fewer than 'R' shares (default 7).
+
+    The third step is to finally run the simulation. An initial probability
+    vector is created (with a 100% chance of N shares and a 0% chance of
+    fewer than N shares), then it is multiplied by the transition matrix for
+    every delta of time. Each time the Checker is to be run, the repair
+    matrix is multiplied in, and some additional stats are accumulated
+    (average number of repairs that occur, average number of shares
+    regenerated per repair).
+
+    The output is a ReliabilityReport instance, which contains a table that
+    samples the state of the simulation once each 'report_period' seconds
+    (defaults to 3 months). Each row of this table will contain the
+    probability vector for one sample period (chance of having X shares, from
+    0 to N, at the end of the period). The report will also contain other
+    information.
+
+    """
+
+    @classmethod
+    def run(klass,
+            drive_lifetime=8*YEAR,
+            k=3, R=7, N=10,
+            delta=1*MONTH,
+            check_period=1*MONTH,
+            report_period=3*MONTH,
+            report_span=5*YEAR,
+            ):
+        self = klass()
+
+        check_period = check_period-1
+        P = self.p_in_period(drive_lifetime, delta)
+
+        decay = self.build_decay_matrix(N, P)
+
+        repair = self.build_repair_matrix(k, N, R)
+
+        #print "DECAY:", decay
+        #print "OLD-POST-REPAIR:", old_post_repair
+        #print "NEW-POST-REPAIR:", decay * repair
+        #print "REPAIR:", repair
+        #print "DIFF:", (old_post_repair - decay * repair)
+
+        START = array([0]*N + [1])
+        DEAD = array([1]*k + [0]*(1+N-k))
+        REPAIRp = array([0]*k + [1]*(R-k) + [0]*(1+N-R))
+        REPAIR_newshares = array([0]*k +
+                                 [N-i for i in range(k, R)] +
+                                 [0]*(1+N-R))
+        assert REPAIR_newshares.shape[0] == N+1
+        #print "START", START
+        #print "REPAIRp", REPAIRp
+        #print "REPAIR_newshares", REPAIR_newshares
+
+        unmaintained_state = START
+        maintained_state = START
+        last_check = 0
+        last_report = 0
+        P_repaired_last_check_period = 0.0
+        needed_repairs = []
+        needed_new_shares = []
+        report = ReliabilityReport()
+
+        for t in range(0, report_span+delta, delta):
+            # the .A[0] turns the one-row matrix back into an array
+            unmaintained_state = (unmaintained_state * decay).A[0]
+            maintained_state = (maintained_state * decay).A[0]
+            if (t-last_check) > check_period:
+                last_check = t
+                # we do a check-and-repair this frequently
+                need_repair = dot(maintained_state, REPAIRp)
+
+                P_repaired_last_check_period = need_repair
+                new_shares = dot(maintained_state, REPAIR_newshares)
+                needed_repairs.append(need_repair)
+                needed_new_shares.append(new_shares)
+
+                maintained_state = (maintained_state * repair).A[0]
+
+            if (t-last_report) > report_period:
+                last_report = t
+                P_dead_unmaintained = dot(unmaintained_state, DEAD)
+                P_dead_maintained = dot(maintained_state, DEAD)
+                cumulative_number_of_repairs = sum(needed_repairs)
+                cumulative_number_of_new_shares = sum(needed_new_shares)
+                report.add_sample(t, unmaintained_state, maintained_state,
+                                  P_repaired_last_check_period,
+                                  cumulative_number_of_repairs,
+                                  cumulative_number_of_new_shares,
+                                  P_dead_unmaintained, P_dead_maintained)
+
+        # record one more sample at the end of the run
+        P_dead_unmaintained = dot(unmaintained_state, DEAD)
+        P_dead_maintained = dot(maintained_state, DEAD)
+        cumulative_number_of_repairs = sum(needed_repairs)
+        cumulative_number_of_new_shares = sum(needed_new_shares)
+        report.add_sample(t, unmaintained_state, maintained_state,
+                          P_repaired_last_check_period,
+                          cumulative_number_of_repairs,
+                          cumulative_number_of_new_shares,
+                          P_dead_unmaintained, P_dead_maintained)
+
+        #def yandm(seconds):
+        #    return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
+        #needed_repairs_total = sum(needed_repairs)
+        #needed_new_shares_total = sum(needed_new_shares)
+        #print "at 2y:"
+        #print " unmaintained", unmaintained_state
+        #print " maintained", maintained_state
+        #print " number of repairs", needed_repairs_total
+        #print " new shares generated", needed_new_shares_total
+        #repair_rate_inv = report_span / needed_repairs_total
+        #print "  avg repair rate: once every %s" % yandm(repair_rate_inv)
+        #print "  avg repair download: one share every %s" % yandm(repair_rate_inv/k)
+        #print "  avg repair upload: one share every %s" % yandm(report_span / needed_new_shares_total)
+
+        return report
+
+    def p_in_period(self, avg_lifetime, period):
+        """Given an average lifetime of a disk (using an exponential model),
+        what is the chance that a live disk will survive the next 'period'
+        seconds?"""
+
+        # eg p_in_period(8*YEAR, MONTH) = 98.94%
+        return math.exp(-1.0*period/avg_lifetime)
+
+    def build_decay_matrix(self, N, P):
+        """Return a decay matrix. decay[start_shares][end_shares] is the
+        conditional probability of finishing with end_shares, given that we
+        started with start_shares."""
+        decay_rows = []
+        decay_rows.append( [0.0]*(N+1) )
+        for start_shares in range(1, (N+1)):
+            end_shares = self.build_decay_row(start_shares, P)
+            decay_row = end_shares + [0.0] * (N-start_shares)
+            assert len(decay_row) == (N+1), len(decay_row)
+            decay_rows.append(decay_row)
+
+        decay = matrix(decay_rows)
+        return decay
+
+    def build_decay_row(self, start_shares, P):
+        """Return a decay row 'end_shares'. end_shares[i] is the chance that
+        we finish with i shares, given that we started with start_shares, for
+        all i between 0 and start_shares, inclusive. This implementation
+        assumes that all shares are independent (IID), but a more complex
+        model could incorporate inter-share failure correlations like having
+        two shares on the same server."""
+        end_shares = statistics.binomial_distribution_pmf(start_shares, P)
+        return end_shares
+
+    def build_repair_matrix(self, k, N, R):
+        """Return a repair matrix. repair[start][end]: is the conditional
+        probability of the repairer finishing with 'end' shares, given that
+        it began with 'start' shares (repair if fewer than R shares). The
+        repairer's behavior is deterministic, so all values in this matrix
+        are either 0 or 1. This matrix should be applied *after* the decay
+        matrix."""
+        new_repair_rows = []
+        for start_shares in range(0, N+1):
+            new_repair_row = [0] * (N+1)
+            if start_shares < k:
+                new_repair_row[start_shares] = 1
+            elif start_shares < R:
+                new_repair_row[N] = 1
+            else:
+                new_repair_row[start_shares] = 1
+            new_repair_rows.append(new_repair_row)
+
+        repair = matrix(new_repair_rows)
+        return repair
+
+class ReliabilityReport:
+    def __init__(self):
+        self.samples = []
+
+    def add_sample(self, when, unmaintained_shareprobs, maintained_shareprobs,
+                   P_repaired_last_check_period,
+                   cumulative_number_of_repairs,
+                   cumulative_number_of_new_shares,
+                   P_dead_unmaintained, P_dead_maintained):
+        """
+        when: the timestamp at the end of the report period
+        unmaintained_shareprobs: a vector of probabilities, element[S]
+                                 is the chance that there are S shares
+                                 left at the end of the report period.
+                                 This tracks what happens if no repair
+                                 is ever done.
+        maintained_shareprobs: same, but for 'maintained' grids, where
+                               check and repair is done at the end
+                               of each check period
+        P_repaired_last_check_period: a float, with the probability
+                                      that a repair was performed
+                                      at the end of the most recent
+                                      check period.
+        cumulative_number_of_repairs: a float, with the average number
+                                      of repairs that will have been
+                                      performed by the end of the
+                                      report period
+        cumulative_number_of_new_shares: a float, with the average number
+                                         of new shares that repair processes
+                                         generated by the end of the report
+                                         period
+        P_dead_unmaintained: a float, with the chance that the file will
+                             be unrecoverable at the end of the period
+        P_dead_maintained: same, but for maintained grids
+
+        """
+        row = (when, unmaintained_shareprobs, maintained_shareprobs,
+               P_repaired_last_check_period,
+               cumulative_number_of_repairs,
+               cumulative_number_of_new_shares,
+               P_dead_unmaintained, P_dead_maintained)
+        self.samples.append(row)
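
The simulation described in the ReliabilityModel docstring reduces to a handful of linear-algebra operations. A condensed sketch of the same three steps (not part of this commit; scipy.stats.binom stands in for allmydata.util.statistics, and delta is set equal to check_period for brevity):

    import math
    import numpy as np
    from scipy.stats import binom

    DAY = 24*60*60; MONTH = 31*DAY; YEAR = 365*DAY
    k, R, N = 3, 7, 10
    delta = MONTH
    P = math.exp(-1.0 * delta / (8*YEAR))  # p_in_period: ~98.94% per-drive survival/month

    # decay matrix: row s is the binomial pmf over how many of s shares survive one delta
    decay = np.zeros((N+1, N+1))
    decay[0, 0] = 1.0
    for s in range(1, N+1):
        decay[s, :s+1] = binom.pmf(np.arange(s+1), s, P)

    # repair matrix: deterministic; k..R-1 surviving shares get restored to N
    repair = np.eye(N+1)
    for s in range(k, R):
        repair[s, s] = 0.0
        repair[s, N] = 1.0

    state = np.zeros(N+1); state[N] = 1.0   # 100% chance of N shares at t=0
    DEAD = np.arange(N+1) < k               # fewer than k shares = file unrecoverable
    for t in range(0, 5*YEAR, delta):
        state = state @ decay               # one delta of drive failures
        state = state @ repair              # check-and-repair at the end of each period
    print("P(dead after 5 years, maintained): %.3g" % state[DEAD].sum())

Because the repairer's behavior is deterministic, every row of the repair matrix is a unit vector: on each check, all of the probability mass sitting between k and R-1 shares is swept back to the N-shares column.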
diff --git a/misc/operations_helpers/provisioning/reliability.xhtml b/misc/operations_helpers/provisioning/reliability.xhtml
new file mode 100644 (file)
index 0000000..f8d93d1
--- /dev/null
@@ -0,0 +1,63 @@
+<html xmlns:n="http://nevow.com/ns/nevow/0.1">
+  <head>
+    <title>Tahoe-LAFS - Reliability Tool</title>
+    <link href="/tahoe.css" rel="stylesheet" type="text/css"/>
+    <link href="/icon.png" rel="shortcut icon" />
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+  </head>
+  <body>
+
+<h1>Tahoe-LAFS Reliability Tool</h1>
+
+<p>Given certain assumptions, this page calculates probability of share loss
+over time, to help make informed decisions about how much redundancy and
+repair bandwidth to configure on a Tahoe-LAFS grid.</p>
+
+<div n:render="forms" />
+
+<h2>Simulation Results</h2>
+
+<p>At the end of the report span (elapsed time <span n:render="report_span"
+/>), the simulated file had the following properties:</p>
+
+<ul>
+    <li>Probability of loss (no maintenance):
+        <span n:render="P_loss_unmaintained"/></li>
+    <li>Probability of loss (with maintenance):
+        <span n:render="P_loss_maintained"/></li>
+    <li>Average repair frequency:
+        once every <span n:render="P_repair_rate"/> secs</li>
+    <li>Average shares generated per repair:
+        <span n:render="P_repair_shares"/></li>
+</ul>
+
+<p>This table shows how the following properties change over time:</p>
+<ul>
+  <li>P_repair: the chance that a repair was performed in the most recent
+  check period.</li>
+  <li>P_dead (unmaintained): the chance that the file will be unrecoverable
+  without periodic check+repair</li>
+  <li>P_dead (maintained): the chance that the file will be unrecoverable even
+  with periodic check+repair</li>
+</ul>
+
+<div>
+<table n:render="sequence" n:data="simulation_table">
+  <tr n:pattern="header">
+    <td>t</td>
+    <td>P_repair</td>
+    <td>P_dead (unmaintained)</td>
+    <td>P_dead (maintained)</td>
+  </tr>
+  <tr n:pattern="item" n:render="simulation_row">
+    <td><n:slot name="t"/></td>
+    <td><n:slot name="P_repair"/></td>
+    <td><n:slot name="P_dead_unmaintained"/></td>
+    <td><n:slot name="P_dead_maintained"/></td>
+  </tr>
+  <tr n:pattern="empty"><td>no simulation data!</td></tr>
+</table>
+</div>
+
+  </body>
+</html>
diff --git a/misc/operations_helpers/provisioning/test_provisioning.py b/misc/operations_helpers/provisioning/test_provisioning.py
new file mode 100644 (file)
index 0000000..71bc657
--- /dev/null
@@ -0,0 +1,113 @@
+
+from twisted.trial import unittest
+from allmydata import provisioning
+ReliabilityModel = None
+try:
+    from allmydata.reliability import ReliabilityModel
+except ImportError:
+    pass # might not be importable, since it needs NumPy
+
+from nevow import inevow
+from zope.interface import implements
+
+class MyRequest:
+    implements(inevow.IRequest)
+    pass
+
+class Provisioning(unittest.TestCase):
+    def getarg(self, name, astype=int):
+        if name in self.fields:
+            return astype(self.fields[name])
+        return None
+
+    def test_load(self):
+        pt = provisioning.ProvisioningTool()
+        self.fields = {}
+        #r = MyRequest()
+        #r.fields = self.fields
+        #ctx = RequestContext()
+        #unfilled = pt.renderSynchronously(ctx)
+        lots_of_stan = pt.do_forms(self.getarg)
+        self.failUnless(lots_of_stan is not None)
+
+        self.fields = {'filled': True,
+                       "num_users": 50e3,
+                       "files_per_user": 1000,
+                       "space_per_user": 1e9,
+                       "sharing_ratio": 1.0,
+                       "encoding_parameters": "3-of-10-5",
+                       "num_servers": 30,
+                       "ownership_mode": "A",
+                       "download_rate": 100,
+                       "upload_rate": 10,
+                       "delete_rate": 10,
+                       "lease_timer": 7,
+                       }
+        #filled = pt.renderSynchronously(ctx)
+        more_stan = pt.do_forms(self.getarg)
+        self.failUnless(more_stan is not None)
+
+        # trigger the wraparound configuration
+        self.fields["num_servers"] = 5
+        #filled = pt.renderSynchronously(ctx)
+        more_stan = pt.do_forms(self.getarg)
+
+        # and other ownership modes
+        self.fields["ownership_mode"] = "B"
+        more_stan = pt.do_forms(self.getarg)
+        self.fields["ownership_mode"] = "E"
+        more_stan = pt.do_forms(self.getarg)
+
+    def test_provisioning_math(self):
+        self.failUnlessEqual(provisioning.binomial(10, 0), 1)
+        self.failUnlessEqual(provisioning.binomial(10, 1), 10)
+        self.failUnlessEqual(provisioning.binomial(10, 2), 45)
+        self.failUnlessEqual(provisioning.binomial(10, 9), 10)
+        self.failUnlessEqual(provisioning.binomial(10, 10), 1)
+
+DAY=24*60*60
+MONTH=31*DAY
+YEAR=365*DAY
+
+class Reliability(unittest.TestCase):
+    def test_basic(self):
+        if ReliabilityModel is None:
+            raise unittest.SkipTest("reliability model requires NumPy")
+
+        # test that numpy math works the way I think it does
+        import numpy
+        decay = numpy.matrix([[1,0,0],
+                             [.1,.9,0],
+                             [.01,.09,.9],
+                             ])
+        start = numpy.array([0,0,1])
+        g2 = (start * decay).A[0]
+        self.failUnlessEqual(repr(g2), repr(numpy.array([.01,.09,.9])))
+        g3 = (g2 * decay).A[0]
+        self.failUnlessEqual(repr(g3), repr(numpy.array([.028,.162,.81])))
+
+        # and the dot product
+        recoverable = numpy.array([0,1,1])
+        P_recoverable_g2 = numpy.dot(g2, recoverable)
+        self.failUnlessAlmostEqual(P_recoverable_g2, .9 + .09)
+        P_recoverable_g3 = numpy.dot(g3, recoverable)
+        self.failUnlessAlmostEqual(P_recoverable_g3, .81 + .162)
+
+        r = ReliabilityModel.run(delta=100000,
+                                 report_period=3*MONTH,
+                                 report_span=5*YEAR)
+        self.failUnlessEqual(len(r.samples), 20)
+
+        last_row = r.samples[-1]
+        #print last_row
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = last_row
+        self.failUnless(isinstance(P_repaired_last_check_period, float))
+        self.failUnless(isinstance(P_dead_unmaintained, float))
+        self.failUnless(isinstance(P_dead_maintained, float))
+        self.failUnlessAlmostEqual(P_dead_unmaintained, 0.033591004555395272)
+        self.failUnlessAlmostEqual(P_dead_maintained, 3.2983995819177542e-08)
+
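
The hand-built 3x3 decay matrix in test_basic uses arbitrary probabilities,
chosen only to verify the NumPy arithmetic; the real model derives each row
of its decay matrix from a binomial PMF (see build_decay_row later in this
diff). A hedged sketch of that relationship, using a hypothetical per-share
survival probability:

    from allmydata.util.statistics import binomial_distribution_pmf

    P_survive = 0.9  # hypothetical chance that one share survives one delta
    row = binomial_distribution_pmf(2, P_survive)
    # row[i] = chance that i of 2 starting shares remain: [0.01, 0.18, 0.81]
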
diff --git a/misc/operations_helpers/provisioning/web_reliability.py b/misc/operations_helpers/provisioning/web_reliability.py
new file mode 100644 (file)
index 0000000..d5d3406
--- /dev/null
@@ -0,0 +1,152 @@
+
+from nevow import rend, tags as T
+reliability = None # might not be usable
+try:
+    from allmydata import reliability # requires NumPy
+except ImportError:
+    pass
+from allmydata.web.common import getxmlfile, get_arg
+
+
+DAY=24*60*60
+MONTH=31*DAY
+YEAR=365*DAY
+
+def is_available():
+    return reliability is not None
+
+def yandm(seconds):
+    return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
+
+class ReliabilityTool(rend.Page):
+    addSlash = True
+    docFactory = getxmlfile("reliability.xhtml")
+
+    DEFAULT_PARAMETERS = [
+        ("drive_lifetime", "8Y", "time",
+         "Average drive lifetime"),
+        ("k", 3, "int",
+         "Minimum number of shares needed to recover the file"),
+        ("R", 7, "int",
+         "Repair threshold: repair will not occur until fewer than R shares "
+         "are left"),
+        ("N", 10, "int",
+         "Total number of shares of the file generated"),
+        ("delta", "1M", "time", "Amount of time between each simulation step"),
+        ("check_period", "1M", "time",
+         "How often to run the checker and repair if fewer than R shares remain"),
+        ("report_period", "3M", "time",
+         "Amount of time between result rows in this report"),
+        ("report_span", "5Y", "time",
+         "Total amount of time covered by this report"),
+        ]
+
+    def parse_time(self, s):
+        if s.endswith("M"):
+            return int(s[:-1]) * MONTH
+        if s.endswith("Y"):
+            return int(s[:-1]) * YEAR
+        return int(s)
+
+    def format_time(self, s):
+        if s%YEAR == 0:
+            return "%dY" % (s/YEAR)
+        if s%MONTH == 0:
+            return "%dM" % (s/MONTH)
+        return "%d" % s
+
+    def get_parameters(self, ctx):
+        parameters = {}
+        for (name,default,argtype,description) in self.DEFAULT_PARAMETERS:
+            v = get_arg(ctx, name, default)
+            if argtype == "time":
+                value = self.parse_time(v)
+            else:
+                value = int(v)
+            parameters[name] = value
+        return parameters
+
+    def renderHTTP(self, ctx):
+        self.parameters = self.get_parameters(ctx)
+        self.results = reliability.ReliabilityModel.run(**self.parameters)
+        return rend.Page.renderHTTP(self, ctx)
+
+    def make_input(self, name, old_value):
+        return T.input(name=name, type="text", size="5",
+                       value=self.format_time(old_value))
+
+    def render_forms(self, ctx, data):
+        f = T.form(action=".", method="get")
+        table = []
+        for (name,default_value,argtype,description) in self.DEFAULT_PARAMETERS:
+            old_value = self.parameters[name]
+            i = self.make_input(name, old_value)
+            table.append(T.tr[T.td[name+":"], T.td[i], T.td[description]])
+        go = T.input(type="submit", value="Recompute")
+        return [T.h2["Simulation Parameters:"],
+                f[T.table[table], go],
+                ]
+
+    def data_simulation_table(self, ctx, data):
+        for row in self.results.samples:
+            yield row
+
+    def render_simulation_row(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = row
+        ctx.fillSlots("t", yandm(when))
+        ctx.fillSlots("P_repair", "%.6f" % P_repaired_last_check_period)
+        ctx.fillSlots("P_dead_unmaintained", "%.6g" % P_dead_unmaintained)
+        ctx.fillSlots("P_dead_maintained", "%.6g" % P_dead_maintained)
+        return ctx.tag
+
+    def render_report_span(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
+        return ctx.tag[yandm(when)]
+
+    def render_P_loss_unmaintained(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
+        return ctx.tag["%.6g (%1.8f%%)" % (P_dead_unmaintained,
+                                           100*P_dead_unmaintained)]
+
+    def render_P_loss_maintained(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
+        return ctx.tag["%.6g (%1.8f%%)" % (P_dead_maintained,
+                                           100*P_dead_maintained)]
+
+    def render_P_repair_rate(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
+        freq = when / cumulative_number_of_repairs
+        return ctx.tag["%.6g" % freq]
+
+    def render_P_repair_shares(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
+        generated_shares = cumulative_number_of_new_shares / cumulative_number_of_repairs
+        return ctx.tag["%1.2f" % generated_shares]
+
+
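
A hedged usage sketch of the duration syntax the form fields accept ("M" for
months, "Y" for years, a bare integer for seconds). This assumes a source
tree where Nevow is installed and the provisioning directory is on sys.path,
so the module imports cleanly:

    from web_reliability import ReliabilityTool, MONTH, YEAR

    tool = ReliabilityTool()
    assert tool.parse_time("8Y") == 8 * YEAR
    assert tool.parse_time("1M") == MONTH
    assert tool.parse_time("90") == 90
    assert tool.format_time(8 * YEAR) == "8Y"  # round-trips whole years
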
diff --git a/src/allmydata/provisioning.py b/src/allmydata/provisioning.py
deleted file mode 100644 (file)
index 9d9af0e..0000000
+++ /dev/null
@@ -1,772 +0,0 @@
-
-from nevow import inevow, rend, tags as T
-import math
-from allmydata.util import mathutil
-from allmydata.web.common import getxmlfile
-
-# factorial and binomial copied from
-# http://mail.python.org/pipermail/python-list/2007-April/435718.html
-
-def factorial(n):
-    """factorial(n): return the factorial of the non-negative integer n.
-    factorial(0) = 1
-    factorial(n) with n<0 is rejected by the assert below
-    """
-    assert n >= 0
-    result = 1
-    for i in xrange(1, n+1):
-        result *= i
-    return result
-
-def binomial(n, k):
-    assert 0 <= k <= n
-    if k == 0 or k == n:
-        return 1
-    # calculate n!/k! as one product, avoiding factors that
-    # just get canceled
-    P = k+1
-    for i in xrange(k+2, n+1):
-        P *= i
-    # if you are paranoid:
-    # C, rem = divmod(P, factorial(n-k))
-    # assert rem == 0
-    # return C
-    return P//factorial(n-k)
-
-class ProvisioningTool(rend.Page):
-    addSlash = True
-    docFactory = getxmlfile("provisioning.xhtml")
-
-    def render_forms(self, ctx, data):
-        req = inevow.IRequest(ctx)
-
-        def getarg(name, astype=int):
-            if req.method != "POST":
-                return None
-            if name in req.fields:
-                return astype(req.fields[name].value)
-            return None
-        return self.do_forms(getarg)
-
-
-    def do_forms(self, getarg):
-        filled = getarg("filled", bool)
-
-        def get_and_set(name, options, default=None, astype=int):
-            current_value = getarg(name, astype)
-            i_select = T.select(name=name)
-            for (count, description) in options:
-                count = astype(count)
-                if ((current_value is not None and count == current_value) or
-                    (current_value is None and count == default)):
-                    o = T.option(value=str(count), selected="true")[description]
-                else:
-                    o = T.option(value=str(count))[description]
-                i_select = i_select[o]
-            if current_value is None:
-                current_value = default
-            return current_value, i_select
-
-        sections = {}
-        def add_input(section, text, entry):
-            if section not in sections:
-                sections[section] = []
-            sections[section].extend([T.div[text, ": ", entry], "\n"])
-
-        def add_output(section, entry):
-            if section not in sections:
-                sections[section] = []
-            sections[section].extend([entry, "\n"])
-
-        def build_section(section):
-            return T.fieldset[T.legend[section], sections[section]]
-
-        def number(value, suffix=""):
-            scaling = 1
-            if value < 1:
-                fmt = "%1.2g%s"
-            elif value < 100:
-                fmt = "%.1f%s"
-            elif value < 1000:
-                fmt = "%d%s"
-            elif value < 1e6:
-                fmt = "%.2fk%s"; scaling = 1e3
-            elif value < 1e9:
-                fmt = "%.2fM%s"; scaling = 1e6
-            elif value < 1e12:
-                fmt = "%.2fG%s"; scaling = 1e9
-            elif value < 1e15:
-                fmt = "%.2fT%s"; scaling = 1e12
-            elif value < 1e18:
-                fmt = "%.2fP%s"; scaling = 1e15
-            else:
-                fmt = "huge! %g%s"
-            return fmt % (value / scaling, suffix)
-
-        user_counts = [(5, "5 users"),
-                       (50, "50 users"),
-                       (200, "200 users"),
-                       (1000, "1k users"),
-                       (10000, "10k users"),
-                       (50000, "50k users"),
-                       (100000, "100k users"),
-                       (500000, "500k users"),
-                       (1000000, "1M users"),
-                       ]
-        num_users, i_num_users = get_and_set("num_users", user_counts, 50000)
-        add_input("Users",
-                  "How many users are on this network?", i_num_users)
-
-        files_per_user_counts = [(100, "100 files"),
-                                 (1000, "1k files"),
-                                 (10000, "10k files"),
-                                 (100000, "100k files"),
-                                 (1e6, "1M files"),
-                                 ]
-        files_per_user, i_files_per_user = get_and_set("files_per_user",
-                                                       files_per_user_counts,
-                                                       1000)
-        add_input("Users",
-                  "How many files for each user? (avg)",
-                  i_files_per_user)
-
-        space_per_user_sizes = [(1e6, "1MB"),
-                                (10e6, "10MB"),
-                                (100e6, "100MB"),
-                                (200e6, "200MB"),
-                                (1e9, "1GB"),
-                                (2e9, "2GB"),
-                                (5e9, "5GB"),
-                                (10e9, "10GB"),
-                                (100e9, "100GB"),
-                                (1e12, "1TB"),
-                                (2e12, "2TB"),
-                                (5e12, "5TB"),
-                                ]
-        # Estimate ~5GB per user as a more realistic case
-        space_per_user, i_space_per_user = get_and_set("space_per_user",
-                                                       space_per_user_sizes,
-                                                       5e9)
-        add_input("Users",
-                  "How much data for each user? (avg)",
-                  i_space_per_user)
-
-        sharing_ratios = [(1.0, "1.0x"),
-                          (1.1, "1.1x"),
-                          (2.0, "2.0x"),
-                          ]
-        sharing_ratio, i_sharing_ratio = get_and_set("sharing_ratio",
-                                                     sharing_ratios, 1.0,
-                                                     float)
-        add_input("Users",
-                  "What is the sharing ratio? (1.0x is no-sharing and"
-                  " no convergence)", i_sharing_ratio)
-
-        # Encoding parameters
-        encoding_choices = [("3-of-10-5", "3.3x (3-of-10, repair below 5)"),
-                            ("3-of-10-8", "3.3x (3-of-10, repair below 8)"),
-                            ("5-of-10-7", "2x (5-of-10, repair below 7)"),
-                            ("8-of-10-9", "1.25x (8-of-10, repair below 9)"),
-                            ("27-of-30-28", "1.1x (27-of-30, repair below 28)"),
-                            ("25-of-100-50", "4x (25-of-100, repair below 50)"),
-                            ]
-        encoding_parameters, i_encoding_parameters = \
-                             get_and_set("encoding_parameters",
-                                         encoding_choices, "3-of-10-5", str)
-        encoding_pieces = encoding_parameters.split("-")
-        k = int(encoding_pieces[0])
-        assert encoding_pieces[1] == "of"
-        n = int(encoding_pieces[2])
-        # we repair the file when the number of available shares drops below
-        # this value
-        repair_threshold = int(encoding_pieces[3])
-
-        add_input("Servers",
-                  "What are the default encoding parameters?",
-                  i_encoding_parameters)
-
-        # Server info
-        num_server_choices = [ (5, "5 servers"),
-                               (10, "10 servers"),
-                               (15, "15 servers"),
-                               (30, "30 servers"),
-                               (50, "50 servers"),
-                               (100, "100 servers"),
-                               (200, "200 servers"),
-                               (300, "300 servers"),
-                               (500, "500 servers"),
-                               (1000, "1k servers"),
-                               (2000, "2k servers"),
-                               (5000, "5k servers"),
-                               (10e3, "10k servers"),
-                               (100e3, "100k servers"),
-                               (1e6, "1M servers"),
-                               ]
-        num_servers, i_num_servers = \
-                     get_and_set("num_servers", num_server_choices, 30, int)
-        add_input("Servers",
-                  "How many servers are there?", i_num_servers)
-
-        # availability is measured in dBA = -dBF, where 0dBF is 100% failure,
-        # 10dBF is 10% failure, 20dBF is 1% failure, etc
-        server_dBA_choices = [ (10, "90% [10dBA] (2.4hr/day)"),
-                               (13, "95% [13dBA] (1.2hr/day)"),
-                               (20, "99% [20dBA] (14min/day or 3.5days/year)"),
-                               (23, "99.5% [23dBA] (7min/day or 1.75days/year)"),
-                               (30, "99.9% [30dBA] (87sec/day or 9hours/year)"),
-                               (40, "99.99% [40dBA] (60sec/week or 53min/year)"),
-                               (50, "99.999% [50dBA] (5min per year)"),
-                               ]
-        server_dBA, i_server_availability = \
-                    get_and_set("server_availability",
-                                server_dBA_choices,
-                                20, int)
-        add_input("Servers",
-                  "What is the server availability?", i_server_availability)
-
-        drive_MTBF_choices = [ (40, "40,000 Hours"),
-                               ]
-        drive_MTBF, i_drive_MTBF = \
-                    get_and_set("drive_MTBF", drive_MTBF_choices, 40, int)
-        add_input("Drives",
-                  "What is the hard drive MTBF?", i_drive_MTBF)
-        # http://www.tgdaily.com/content/view/30990/113/
-        # http://labs.google.com/papers/disk_failures.pdf
-        # google sees:
-        #  1.7% of the drives they replaced were 0-1 years old
-        #  8% of the drives they replaced were 1-2 years old
-        #  8.6% were 2-3 years old
-        #  6% were 3-4 years old, about 8% were 4-5 years old
-
-        drive_size_choices = [ (100, "100 GB"),
-                               (250, "250 GB"),
-                               (500, "500 GB"),
-                               (750, "750 GB"),
-                               (1000, "1000 GB"),
-                               (2000, "2000 GB"),
-                               (3000, "3000 GB"),
-                               ]
-        drive_size, i_drive_size = \
-                    get_and_set("drive_size", drive_size_choices, 3000, int)
-        drive_size = drive_size * 1e9
-        add_input("Drives",
-                  "What is the capacity of each hard drive?", i_drive_size)
-        drive_failure_model_choices = [ ("E", "Exponential"),
-                                        ("U", "Uniform"),
-                                        ]
-        drive_failure_model, i_drive_failure_model = \
-                             get_and_set("drive_failure_model",
-                                         drive_failure_model_choices,
-                                         "E", str)
-        add_input("Drives",
-                  "How should we model drive failures?", i_drive_failure_model)
-
-        # drive_failure_rate is in failures per second
-        if drive_failure_model == "E":
-            drive_failure_rate = 1.0 / (drive_MTBF * 1000 * 3600)
-        else:
-            drive_failure_rate = 0.5 / (drive_MTBF * 1000 * 3600)
-
-        # deletion/gc/ownership mode
-        ownership_choices = [ ("A", "no deletion, no gc, no owners"),
-                              ("B", "deletion, no gc, no owners"),
-                              ("C", "deletion, share timers, no owners"),
-                              ("D", "deletion, no gc, yes owners"),
-                              ("E", "deletion, owner timers"),
-                              ]
-        ownership_mode, i_ownership_mode = \
-                        get_and_set("ownership_mode", ownership_choices,
-                                    "A", str)
-        add_input("Servers",
-                  "What is the ownership mode?", i_ownership_mode)
-
-        # client access behavior
-        access_rates = [ (1, "one file per day"),
-                         (10, "10 files per day"),
-                         (100, "100 files per day"),
-                         (1000, "1k files per day"),
-                         (10e3, "10k files per day"),
-                         (100e3, "100k files per day"),
-                         ]
-        download_files_per_day, i_download_rate = \
-                                get_and_set("download_rate", access_rates,
-                                            100, int)
-        add_input("Users",
-                  "How many files are downloaded per day?", i_download_rate)
-        download_rate = 1.0 * download_files_per_day / (24*60*60)
-
-        upload_files_per_day, i_upload_rate = \
-                              get_and_set("upload_rate", access_rates,
-                                          10, int)
-        add_input("Users",
-                  "How many files are uploaded per day?", i_upload_rate)
-        upload_rate = 1.0 * upload_files_per_day / (24*60*60)
-
-        delete_files_per_day, i_delete_rate = \
-                              get_and_set("delete_rate", access_rates,
-                                          10, int)
-        add_input("Users",
-                  "How many files are deleted per day?", i_delete_rate)
-        delete_rate = 1.0 * delete_files_per_day / (24*60*60)
-
-
-        # the value is in days
-        lease_timers = [ (1, "one refresh per day"),
-                         (7, "one refresh per week"),
-                         ]
-        lease_timer, i_lease = \
-                     get_and_set("lease_timer", lease_timers,
-                                 7, int)
-        add_input("Users",
-                  "How frequently do clients refresh files or accounts? "
-                  "(if necessary)",
-                  i_lease)
-        seconds_per_lease = 24*60*60*lease_timer
-
-        check_timer_choices = [ (1, "every week"),
-                                (4, "every month"),
-                                (8, "every two months"),
-                                (16, "every four months"),
-                                ]
-        check_timer, i_check_timer = \
-                     get_and_set("check_timer", check_timer_choices, 4, int)
-        add_input("Users",
-                  "How frequently should we check on each file?",
-                  i_check_timer)
-        file_check_interval = check_timer * 7 * 24 * 3600
-
-
-        if filled:
-            add_output("Users", T.div["Total users: %s" % number(num_users)])
-            add_output("Users",
-                       T.div["Files per user: %s" % number(files_per_user)])
-            file_size = 1.0 * space_per_user / files_per_user
-            add_output("Users",
-                       T.div["Average file size: ", number(file_size)])
-            total_files = num_users * files_per_user / sharing_ratio
-
-            add_output("Grid",
-                       T.div["Total number of files in grid: ",
-                             number(total_files)])
-            total_space = num_users * space_per_user / sharing_ratio
-            add_output("Grid",
-                       T.div["Total volume of plaintext in grid: ",
-                             number(total_space, "B")])
-
-            total_shares = n * total_files
-            add_output("Grid",
-                       T.div["Total shares in grid: ", number(total_shares)])
-            expansion = float(n) / float(k)
-
-            total_usage = expansion * total_space
-            add_output("Grid",
-                       T.div["Share data in grid: ", number(total_usage, "B")])
-
-            if n > num_servers:
-                # silly configuration, causes Tahoe2 to wrap and put multiple
-                # shares on some servers.
-                add_output("Servers",
-                           T.div["non-ideal: more shares than servers"
-                                 " (n=%d, servers=%d)" % (n, num_servers)])
-                # every file has at least one share on every server
-                buckets_per_server = total_files
-                shares_per_server = total_files * ((1.0 * n) / num_servers)
-            else:
-                # if nobody is full, then no lease requests will be turned
-                # down for lack of space, and no two shares for the same file
-                # will share a server. Therefore the chance that any given
-                # file has a share on any given server is n/num_servers.
-                buckets_per_server = total_files * ((1.0 * n) / num_servers)
-                # since each such represented file only puts one share on a
-                # server, the total number of shares per server is the same.
-                shares_per_server = buckets_per_server
-            add_output("Servers",
-                       T.div["Buckets per server: ",
-                             number(buckets_per_server)])
-            add_output("Servers",
-                       T.div["Shares per server: ",
-                             number(shares_per_server)])
-
-            # how much space is used on the storage servers for the shares?
-            #  the share data itself
-            share_data_per_server = total_usage / num_servers
-            add_output("Servers",
-                       T.div["Share data per server: ",
-                             number(share_data_per_server, "B")])
-            # this is determined empirically. H=hashsize=32, for a one-segment
-            # file and 3-of-10 encoding
-            share_validation_per_server = 266 * shares_per_server
-            # this could be 423*buckets_per_server, if we moved the URI
-            # extension into a separate file, but that would actually consume
-            # *more* space (minimum filesize is 4KiB), unless we moved all
-            # shares for a given bucket into a single file.
-            share_uri_extension_per_server = 423 * shares_per_server
-
-            # ownership mode adds per-bucket data
-            H = 32 # depends upon the desired security of delete/refresh caps
-            # bucket_lease_size is the amount of data needed to keep track of
-            # the delete/refresh caps for each bucket.
-            bucket_lease_size = 0
-            client_bucket_refresh_rate = 0
-            owner_table_size = 0
-            if ownership_mode in ("B", "C", "D", "E"):
-                bucket_lease_size = sharing_ratio * 1.0 * H
-            if ownership_mode in ("B", "C"):
-                # refreshes per second per client
-                client_bucket_refresh_rate = (1.0 * n * files_per_user /
-                                              seconds_per_lease)
-                add_output("Users",
-                           T.div["Client share refresh rate (outbound): ",
-                                 number(client_bucket_refresh_rate, "Hz")])
-                server_bucket_refresh_rate = (client_bucket_refresh_rate *
-                                              num_users / num_servers)
-                add_output("Servers",
-                           T.div["Server share refresh rate (inbound): ",
-                                 number(server_bucket_refresh_rate, "Hz")])
-            if ownership_mode in ("D", "E"):
-                # each server must maintain a bidirectional mapping from
-                # buckets to owners. One way to implement this would be to
-                # put a list of four-byte owner numbers into each bucket, and
-                # a list of four-byte share numbers into each owner (although
-                # of course we'd really just throw it into a database and let
-                # the experts take care of the details).
-                owner_table_size = 2*(buckets_per_server * sharing_ratio * 4)
-
-            if ownership_mode in ("E",):
-                # in this mode, clients must refresh one timer per server
-                client_account_refresh_rate = (1.0 * num_servers /
-                                               seconds_per_lease)
-                add_output("Users",
-                           T.div["Client account refresh rate (outbound): ",
-                                 number(client_account_refresh_rate, "Hz")])
-                server_account_refresh_rate = (client_account_refresh_rate *
-                                              num_users / num_servers)
-                add_output("Servers",
-                           T.div["Server account refresh rate (inbound): ",
-                                 number(server_account_refresh_rate, "Hz")])
-
-            # TODO: buckets vs shares here is a bit wonky, but in
-            # non-wrapping grids it shouldn't matter
-            share_lease_per_server = bucket_lease_size * buckets_per_server
-            share_ownertable_per_server = owner_table_size
-
-            share_space_per_server = (share_data_per_server +
-                                      share_validation_per_server +
-                                      share_uri_extension_per_server +
-                                      share_lease_per_server +
-                                      share_ownertable_per_server)
-            add_output("Servers",
-                       T.div["Share space per server: ",
-                             number(share_space_per_server, "B"),
-                             " (data ",
-                             number(share_data_per_server, "B"),
-                             ", validation ",
-                             number(share_validation_per_server, "B"),
-                             ", UEB ",
-                             number(share_uri_extension_per_server, "B"),
-                             ", lease ",
-                             number(share_lease_per_server, "B"),
-                             ", ownertable ",
-                             number(share_ownertable_per_server, "B"),
-                             ")",
-                             ])
-
-
-            # rates
-            client_download_share_rate = download_rate * k
-            client_download_byte_rate = download_rate * file_size
-            add_output("Users",
-                       T.div["download rate: shares = ",
-                             number(client_download_share_rate, "Hz"),
-                             " , bytes = ",
-                             number(client_download_byte_rate, "Bps"),
-                             ])
-            total_file_check_rate = 1.0 * total_files / file_check_interval
-            client_check_share_rate = total_file_check_rate / num_users
-            add_output("Users",
-                       T.div["file check rate: shares = ",
-                             number(client_check_share_rate, "Hz"),
-                             " (interval = %s)" %
-                             number(1 / client_check_share_rate, "s"),
-                             ])
-
-            client_upload_share_rate = upload_rate * n
-            # TODO: doesn't include overhead
-            client_upload_byte_rate = upload_rate * file_size * expansion
-            add_output("Users",
-                       T.div["upload rate: shares = ",
-                             number(client_upload_share_rate, "Hz"),
-                             " , bytes = ",
-                             number(client_upload_byte_rate, "Bps"),
-                             ])
-            client_delete_share_rate = delete_rate * n
-
-            server_inbound_share_rate = (client_upload_share_rate *
-                                         num_users / num_servers)
-            server_inbound_byte_rate = (client_upload_byte_rate *
-                                        num_users / num_servers)
-            add_output("Servers",
-                       T.div["upload rate (inbound): shares = ",
-                             number(server_inbound_share_rate, "Hz"),
-                             " , bytes = ",
-                              number(server_inbound_byte_rate, "Bps"),
-                             ])
-            add_output("Servers",
-                       T.div["share check rate (inbound): ",
-                             number(total_file_check_rate * n / num_servers,
-                                    "Hz"),
-                             ])
-
-            server_share_modify_rate = ((client_upload_share_rate +
-                                         client_delete_share_rate) *
-                                         num_users / num_servers)
-            add_output("Servers",
-                       T.div["share modify rate: shares = ",
-                             number(server_share_modify_rate, "Hz"),
-                             ])
-
-            server_outbound_share_rate = (client_download_share_rate *
-                                          num_users / num_servers)
-            server_outbound_byte_rate = (client_download_byte_rate *
-                                         num_users / num_servers)
-            add_output("Servers",
-                       T.div["download rate (outbound): shares = ",
-                             number(server_outbound_share_rate, "Hz"),
-                             " , bytes = ",
-                              number(server_outbound_byte_rate, "Bps"),
-                             ])
-
-
-            total_share_space = num_servers * share_space_per_server
-            add_output("Grid",
-                       T.div["Share space consumed: ",
-                             number(total_share_space, "B")])
-            add_output("Grid",
-                       T.div[" %% validation: %.2f%%" %
-                             (100.0 * share_validation_per_server /
-                              share_space_per_server)])
-            add_output("Grid",
-                       T.div[" %% uri-extension: %.2f%%" %
-                             (100.0 * share_uri_extension_per_server /
-                              share_space_per_server)])
-            add_output("Grid",
-                       T.div[" %% lease data: %.2f%%" %
-                             (100.0 * share_lease_per_server /
-                              share_space_per_server)])
-            add_output("Grid",
-                       T.div[" %% owner data: %.2f%%" %
-                             (100.0 * share_ownertable_per_server /
-                              share_space_per_server)])
-            add_output("Grid",
-                       T.div[" %% share data: %.2f%%" %
-                             (100.0 * share_data_per_server /
-                              share_space_per_server)])
-            add_output("Grid",
-                       T.div["file check rate: ",
-                             number(total_file_check_rate,
-                                    "Hz")])
-
-            total_drives = max(mathutil.div_ceil(int(total_share_space),
-                                                 int(drive_size)),
-                               num_servers)
-            add_output("Drives",
-                       T.div["Total drives: ", number(total_drives), " drives"])
-            drives_per_server = mathutil.div_ceil(total_drives, num_servers)
-            add_output("Servers",
-                       T.div["Drives per server: ", drives_per_server])
-
-            # costs
-            if drive_size == 3000 * 1e9:
-                add_output("Servers", T.div["3000GB drive: $250 each"])
-                drive_cost = 250
-            else:
-                add_output("Servers",
-                           T.div[T.b["unknown cost per drive, assuming $100"]])
-                drive_cost = 100
-
-            if drives_per_server <= 4:
-                add_output("Servers", T.div["1U box with <= 4 drives: $1500"])
-                server_cost = 1500 # typical 1U box
-            elif drives_per_server <= 12:
-                add_output("Servers", T.div["2U box with <= 12 drives: $2500"])
-                server_cost = 2500 # 2U box
-            else:
-                add_output("Servers",
-                           T.div[T.b["Note: too many drives per server, "
-                                     "assuming $3000"]])
-                server_cost = 3000
-
-            server_capital_cost = (server_cost + drives_per_server * drive_cost)
-            total_server_cost = float(num_servers * server_capital_cost)
-            add_output("Servers", T.div["Capital cost per server: $",
-                                        server_capital_cost])
-            add_output("Grid", T.div["Capital cost for all servers: $",
-                                     number(total_server_cost)])
-            # $70/Mbps/mo
-            # $44/server/mo power+space
-            server_bandwidth = max(server_inbound_byte_rate,
-                                   server_outbound_byte_rate)
-            server_bandwidth_mbps = mathutil.div_ceil(int(server_bandwidth*8),
-                                                      int(1e6))
-            server_monthly_cost = 70*server_bandwidth_mbps + 44
-            add_output("Servers", T.div["Monthly cost per server: $",
-                                        server_monthly_cost])
-            add_output("Users", T.div["Capital cost per user: $",
-                                      number(total_server_cost / num_users)])
-
-            # reliability
-            any_drive_failure_rate = total_drives * drive_failure_rate
-            any_drive_MTBF = 1 // any_drive_failure_rate  # in seconds
-            any_drive_MTBF_days = any_drive_MTBF / 86400
-            add_output("Drives",
-                       T.div["MTBF (any drive): ",
-                             number(any_drive_MTBF_days), " days"])
-            drive_replacement_monthly_cost = (float(drive_cost)
-                                              * any_drive_failure_rate
-                                              *30*86400)
-            add_output("Grid",
-                       T.div["Monthly cost of replacing drives: $",
-                             number(drive_replacement_monthly_cost)])
-
-            total_server_monthly_cost = float(num_servers * server_monthly_cost
-                                              + drive_replacement_monthly_cost)
-
-            add_output("Grid", T.div["Monthly cost for all servers: $",
-                                     number(total_server_monthly_cost)])
-            add_output("Users",
-                       T.div["Monthly cost per user: $",
-                             number(total_server_monthly_cost / num_users)])
-
-            # availability
-            file_dBA = self.file_availability(k, n, server_dBA)
-            user_files_dBA = self.many_files_availability(file_dBA,
-                                                          files_per_user)
-            all_files_dBA = self.many_files_availability(file_dBA, total_files)
-            add_output("Users",
-                       T.div["availability of: ",
-                             "arbitrary file = %d dBA, " % file_dBA,
-                             "all files of user1 = %d dBA, " % user_files_dBA,
-                             "all files in grid = %d dBA" % all_files_dBA,
-                             ],
-                       )
-
-            time_until_files_lost = (n-k+1) / any_drive_failure_rate
-            add_output("Grid",
-                       T.div["avg time until files are lost: ",
-                             number(time_until_files_lost, "s"), ", ",
-                             number(time_until_files_lost/86400, " days"),
-                             ])
-
-            share_data_loss_rate = any_drive_failure_rate * drive_size
-            add_output("Grid",
-                       T.div["share data loss rate: ",
-                             number(share_data_loss_rate,"Bps")])
-
-            # the worst-case survival numbers occur when we do a file check
-            # and the file is just above the threshold for repair (so we
-            # decide to not repair it). The question is then: what is the
-            # chance that the file will decay so badly before the next check
-            # that we can't recover it? The resulting probability is per
-            # check interval.
-            # Note that the chances of us getting into this situation are low.
-            P_disk_failure_during_interval = (drive_failure_rate *
-                                              file_check_interval)
-            disk_failure_dBF = 10*math.log10(P_disk_failure_during_interval)
-            disk_failure_dBA = -disk_failure_dBF
-            file_survives_dBA = self.file_availability(k, repair_threshold,
-                                                       disk_failure_dBA)
-            user_files_survives_dBA = self.many_files_availability( \
-                file_survives_dBA, files_per_user)
-            all_files_survives_dBA = self.many_files_availability( \
-                file_survives_dBA, total_files)
-            add_output("Users",
-                       T.div["survival of: ",
-                             "arbitrary file = %d dBA, " % file_survives_dBA,
-                             "all files of user1 = %d dBA, " %
-                             user_files_survives_dBA,
-                             "all files in grid = %d dBA" %
-                             all_files_survives_dBA,
-                             " (per worst-case check interval)",
-                             ])
-
-
-
-        all_sections = []
-        all_sections.append(build_section("Users"))
-        all_sections.append(build_section("Servers"))
-        all_sections.append(build_section("Drives"))
-        if "Grid" in sections:
-            all_sections.append(build_section("Grid"))
-
-        f = T.form(action=".", method="post", enctype="multipart/form-data")
-
-        if filled:
-            action = "Recompute"
-        else:
-            action = "Compute"
-
-        f = f[T.input(type="hidden", name="filled", value="true"),
-              T.input(type="submit", value=action),
-              all_sections,
-              ]
-
-        try:
-            from allmydata import reliability
-            # we import this just to test to see if the page is available
-            _hush_pyflakes = reliability
-            del _hush_pyflakes
-            f = [T.div[T.a(href="../reliability")["Reliability Math"]], f]
-        except ImportError:
-            pass
-
-        return f
-
-    def file_availability(self, k, n, server_dBA):
-        """
-        The full formula for the availability of a specific file is::
-
-         1 - sum([choose(N,i) * p**i * (1-p)**(N-i) for i in range(k)])
-
-        Where choose(N,i) = N! / ( i! * (N-i)! ) . Note that each term of
-        this summation is the probability that there are exactly 'i' servers
-        available, and what we're doing is adding up the cases where i is too
-        low.
-
-        This is a nuisance to calculate at all accurately, especially once N
-        gets large, and when p is close to unity. So we make an engineering
-        approximation: if (1-p) is very small, then each [i] term is much
-        larger than the [i-1] term, and the sum is dominated by the i=k-1
-        term. This only works for (1-p) < 10%, and when the choose() function
-        doesn't rise fast enough to compensate. For high-expansion encodings
-        (3-of-10, 25-of-100), the choose() function is rising at the same
-        time as the (1-p)**(N-i) term, so that's not an issue. For
-        low-expansion encodings (7-of-10, 75-of-100) the two values are
-        moving in opposite directions, so more care must be taken.
-
-        Note that the p**i term has only a minor effect as long as (1-p)*N is
-        small, and even then the effect is attenuated by the 1-p term.
-        """
-
-        assert server_dBA > 9  # >=90% availability to use the approximation
-        factor = binomial(n, k-1)
-        factor_dBA = 10 * math.log10(factor)
-        exponent = n - k + 1
-        file_dBA = server_dBA * exponent - factor_dBA
-        return file_dBA
-
-    def many_files_availability(self, file_dBA, num_files):
-        """The probability that 'num_files' independent bernoulli trials will
-        succeed (i.e. we can recover all files in the grid at any given
-        moment) is p**num_files . Since p is close to unity, we express in p
-        in dBA instead, so we can get useful precision on q (=1-p), and then
-        the formula becomes::
-
-         P_some_files_unavailable = 1 - (1 - q)**num_files
-
-        That (1-q)**n expands with the usual binomial sequence, 1 - nq +
-        Xq**2 ... + Xq**n . We use the same approximation as before, since we
-        know q is close to zero, and we get to ignore all the terms past -nq.
-        """
-
-        many_files_dBA = file_dBA - 10 * math.log10(num_files)
-        return many_files_dBA
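
The dBA arithmetic above can be checked by hand. A small worked sketch with
hypothetical inputs, combining the dBA definition from the comments with the
file_availability() and many_files_availability() approximations:

    import math

    p = 0.99                              # hypothetical server availability
    server_dBA = -10 * math.log10(1 - p)  # q = 0.01, i.e. 20 dBA
    n, k = 10, 3                          # 3-of-10 encoding
    factor_dBA = 10 * math.log10(45)      # binomial(10, 2) == 45
    file_dBA = server_dBA * (n - k + 1) - factor_dBA  # about 143.5 dBA
    million = 1e6
    all_files_dBA = file_dBA - 10 * math.log10(million)  # about 83.5 dBA
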
diff --git a/src/allmydata/reliability.py b/src/allmydata/reliability.py
deleted file mode 100644 (file)
index a0d6076..0000000
+++ /dev/null
@@ -1,251 +0,0 @@
-#! /usr/bin/python
-
-import math
-from allmydata.util import statistics
-from numpy import array, matrix, dot
-
-DAY=24*60*60
-MONTH=31*DAY
-YEAR=365*DAY
-
-class ReliabilityModel:
-    """Generate a model of system-wide reliability, given several input
-    parameters.
-
-    This runs a simulation in which time is quantized down to 'delta' seconds
-    (default is one month): a smaller delta will result in a more accurate
-    simulation, but will take longer to run. 'report_span' simulated seconds
-    will be run.
-
-    The encoding parameters are provided as 'k' (minimum number of shares
-    needed to recover the file) and 'N' (total number of shares generated).
-    The default parameters are 3-of-10.
-
-    The first step is to build a probability of individual drive loss during
-    any given delta. This uses a simple exponential model, in which the
-    average drive lifetime is specified by the 'drive_lifetime' parameter
-    (default is 8 years).
-
-    The second step is to calculate a 'transition matrix': a table of
-    probabilities that shows, given A shares at the start of the delta, what
-    the chances are of having B shares left at the end of the delta. The
-    current code optimistically assumes all drives are independent. A
-    subclass could override that assumption.
-
-    An additional 'repair matrix' is created to show what happens when the
-    Checker/Repairer is run. In the simulation, the Checker will be run every
-    'check_period' seconds (default is one month), and the Repairer will be
-    run if it sees fewer than 'R' shares (default 7).
-
-    The third step is to finally run the simulation. An initial probability
-    vector is created (with a 100% chance of N shares and a 0% chance of
-    fewer than N shares), then it is multiplied by the transition matrix for
-    every delta of time. Each time the Checker is to be run, the repair
-    matrix is multiplied in, and some additional stats are accumulated
-    (average number of repairs that occur, average number of shares
-    regenerated per repair).
-
-    The output is a ReliabilityReport instance, which contains a table that
-    samples the state of the simulation once each 'report_period' seconds
-    (defaults to 3 months). Each row of this table will contain the
-    probability vector for one sample period (chance of having X shares, from
-    0 to N, at the end of the period). The report will also contain other
-    information.
-
-    """
-
-    @classmethod
-    def run(klass,
-            drive_lifetime=8*YEAR,
-            k=3, R=7, N=10,
-            delta=1*MONTH,
-            check_period=1*MONTH,
-            report_period=3*MONTH,
-            report_span=5*YEAR,
-            ):
-        self = klass()
-
-        check_period = check_period-1
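-        # presumably subtracted so that the strict "(t-last_check) >
-        # check_period" test below fires on exact check_period boundaries
-        # (i.e. it behaves like >=)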
-        P = self.p_in_period(drive_lifetime, delta)
-
-        decay = self.build_decay_matrix(N, P)
-
-        repair = self.build_repair_matrix(k, N, R)
-
-        #print "DECAY:", decay
-        #print "OLD-POST-REPAIR:", old_post_repair
-        #print "NEW-POST-REPAIR:", decay * repair
-        #print "REPAIR:", repair
-        #print "DIFF:", (old_post_repair - decay * repair)
-
-        START = array([0]*N + [1])
-        DEAD = array([1]*k + [0]*(1+N-k))
-        REPAIRp = array([0]*k + [1]*(R-k) + [0]*(1+N-R))
-        REPAIR_newshares = array([0]*k +
-                                 [N-i for i in range(k, R)] +
-                                 [0]*(1+N-R))
-        assert REPAIR_newshares.shape[0] == N+1
-        #print "START", START
-        #print "REPAIRp", REPAIRp
-        #print "REPAIR_newshares", REPAIR_newshares
-
-        unmaintained_state = START
-        maintained_state = START
-        last_check = 0
-        last_report = 0
-        P_repaired_last_check_period = 0.0
-        needed_repairs = []
-        needed_new_shares = []
-        report = ReliabilityReport()
-
-        for t in range(0, report_span+delta, delta):
-            # the .A[0] turns the one-row matrix back into an array
-            unmaintained_state = (unmaintained_state * decay).A[0]
-            maintained_state = (maintained_state * decay).A[0]
-            if (t-last_check) > check_period:
-                last_check = t
-                # we do a check-and-repair this frequently
-                need_repair = dot(maintained_state, REPAIRp)
-
-                P_repaired_last_check_period = need_repair
-                new_shares = dot(maintained_state, REPAIR_newshares)
-                needed_repairs.append(need_repair)
-                needed_new_shares.append(new_shares)
-
-                maintained_state = (maintained_state * repair).A[0]
-
-            if (t-last_report) > report_period:
-                last_report = t
-                P_dead_unmaintained = dot(unmaintained_state, DEAD)
-                P_dead_maintained = dot(maintained_state, DEAD)
-                cumulative_number_of_repairs = sum(needed_repairs)
-                cumulative_number_of_new_shares = sum(needed_new_shares)
-                report.add_sample(t, unmaintained_state, maintained_state,
-                                  P_repaired_last_check_period,
-                                  cumulative_number_of_repairs,
-                                  cumulative_number_of_new_shares,
-                                  P_dead_unmaintained, P_dead_maintained)
-
-        # record one more sample at the end of the run
-        P_dead_unmaintained = dot(unmaintained_state, DEAD)
-        P_dead_maintained = dot(maintained_state, DEAD)
-        cumulative_number_of_repairs = sum(needed_repairs)
-        cumulative_number_of_new_shares = sum(needed_new_shares)
-        report.add_sample(t, unmaintained_state, maintained_state,
-                          P_repaired_last_check_period,
-                          cumulative_number_of_repairs,
-                          cumulative_number_of_new_shares,
-                          P_dead_unmaintained, P_dead_maintained)
-
-        #def yandm(seconds):
-        #    return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
-        #needed_repairs_total = sum(needed_repairs)
-        #needed_new_shares_total = sum(needed_new_shares)
-        #print "at 2y:"
-        #print " unmaintained", unmaintained_state
-        #print " maintained", maintained_state
-        #print " number of repairs", needed_repairs_total
-        #print " new shares generated", needed_new_shares_total
-        #repair_rate_inv = report_span / needed_repairs_total
-        #print "  avg repair rate: once every %s" % yandm(repair_rate_inv)
-        #print "  avg repair download: one share every %s" % yandm(repair_rate_inv/k)
-        #print "  avg repair upload: one share every %s" % yandm(report_span / needed_new_shares_total)
-
-        return report
-
-    def p_in_period(self, avg_lifetime, period):
-        """Given an average lifetime of a disk (using an exponential model),
-        what is the chance that a live disk will survive the next 'period'
-        seconds?"""
-
-        # eg p_in_period(8*YEAR, MONTH) = 98.94%
-        return math.exp(-1.0*period/avg_lifetime)
-
-    def build_decay_matrix(self, N, P):
-        """Return a decay matrix. decay[start_shares][end_shares] is the
-        conditional probability of finishing with end_shares, given that we
-        started with start_shares."""
-        decay_rows = []
-        decay_rows.append( [0.0]*(N+1) )
-        for start_shares in range(1, (N+1)):
-            end_shares = self.build_decay_row(start_shares, P)
-            decay_row = end_shares + [0.0] * (N-start_shares)
-            assert len(decay_row) == (N+1), len(decay_row)
-            decay_rows.append(decay_row)
-
-        decay = matrix(decay_rows)
-        return decay
-
-    def build_decay_row(self, start_shares, P):
-        """Return a decay row 'end_shares'. end_shares[i] is the chance that
-        we finish with i shares, given that we started with start_shares, for
-        all i between 0 and start_shares, inclusive. This implementation
-        assumes that all shares are independent (IID), but a more complex
-        model could incorporate inter-share failure correlations like having
-        two shares on the same server."""
-        end_shares = statistics.binomial_distribution_pmf(start_shares, P)
-        return end_shares
-
-    def build_repair_matrix(self, k, N, R):
-        """Return a repair matrix. repair[start][end]: is the conditional
-        probability of the repairer finishing with 'end' shares, given that
-        it began with 'start' shares (repair if fewer than R shares). The
-        repairer's behavior is deterministic, so all values in this matrix
-        are either 0 or 1. This matrix should be applied *after* the decay
-        matrix."""
-        new_repair_rows = []
-        for start_shares in range(0, N+1):
-            new_repair_row = [0] * (N+1)
-            if start_shares < k:
-                new_repair_row[start_shares] = 1
-            elif start_shares < R:
-                new_repair_row[N] = 1
-            else:
-                new_repair_row[start_shares] = 1
-            new_repair_rows.append(new_repair_row)
-
-        repair = matrix(new_repair_rows)
-        return repair
-
-class ReliabilityReport:
-    def __init__(self):
-        self.samples = []
-
-    def add_sample(self, when, unmaintained_shareprobs, maintained_shareprobs,
-                   P_repaired_last_check_period,
-                   cumulative_number_of_repairs,
-                   cumulative_number_of_new_shares,
-                   P_dead_unmaintained, P_dead_maintained):
-        """
-        when: the timestamp at the end of the report period
-        unmaintained_shareprobs: a vector of probabilities, element[S]
-                                 is the chance that there are S shares
-                                 left at the end of the report period.
-                                 This tracks what happens if no repair
-                                 is ever done.
-        maintained_shareprobs: same, but for 'maintained' grids, where
-                               check and repair is done at the end
-                               of each check period
-        P_repaired_last_check_period: a float, with the probability
-                                      that a repair was performed
-                                      at the end of the most recent
-                                      check period.
-        cumulative_number_of_repairs: a float, with the average number
-                                      of repairs that will have been
-                                      performed by the end of the
-                                      report period
-        cumulative_number_of_new_shares: a float, with the average number
-                                         of new shares that repair processes
-                                         generated by the end of the report
-                                         period
-        P_dead_unmaintained: a float, with the chance that the file will
-                             be unrecoverable at the end of the period
-        P_dead_maintained: same, but for maintained grids
-
-        """
-        row = (when, unmaintained_shareprobs, maintained_shareprobs,
-               P_repaired_last_check_period,
-               cumulative_number_of_repairs,
-               cumulative_number_of_new_shares,
-               P_dead_unmaintained, P_dead_maintained)
-        self.samples.append(row)
diff --git a/src/allmydata/test/test_provisioning.py b/src/allmydata/test/test_provisioning.py
deleted file mode 100644 (file)
index 71bc657..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-
-from twisted.trial import unittest
-from allmydata import provisioning
-ReliabilityModel = None
-try:
-    from allmydata.reliability import ReliabilityModel
-except ImportError:
-    pass # might not be importable, since it needs NumPy
-
-from nevow import inevow
-from zope.interface import implements
-
-class MyRequest:
-    implements(inevow.IRequest)
-    pass
-
-class Provisioning(unittest.TestCase):
-    def getarg(self, name, astype=int):
-        if name in self.fields:
-            return astype(self.fields[name])
-        return None
-
-    def test_load(self):
-        pt = provisioning.ProvisioningTool()
-        self.fields = {}
-        #r = MyRequest()
-        #r.fields = self.fields
-        #ctx = RequestContext()
-        #unfilled = pt.renderSynchronously(ctx)
-        lots_of_stan = pt.do_forms(self.getarg)
-        self.failUnless(lots_of_stan is not None)
-
-        self.fields = {'filled': True,
-                       "num_users": 50e3,
-                       "files_per_user": 1000,
-                       "space_per_user": 1e9,
-                       "sharing_ratio": 1.0,
-                       "encoding_parameters": "3-of-10-5",
-                       "num_servers": 30,
-                       "ownership_mode": "A",
-                       "download_rate": 100,
-                       "upload_rate": 10,
-                       "delete_rate": 10,
-                       "lease_timer": 7,
-                       }
-        #filled = pt.renderSynchronously(ctx)
-        more_stan = pt.do_forms(self.getarg)
-        self.failUnless(more_stan is not None)
-
-        # trigger the wraparound configuration
-        self.fields["num_servers"] = 5
-        #filled = pt.renderSynchronously(ctx)
-        more_stan = pt.do_forms(self.getarg)
-
-        # and other ownership modes
-        self.fields["ownership_mode"] = "B"
-        more_stan = pt.do_forms(self.getarg)
-        self.fields["ownership_mode"] = "E"
-        more_stan = pt.do_forms(self.getarg)
-
-    def test_provisioning_math(self):
-        self.failUnlessEqual(provisioning.binomial(10, 0), 1)
-        self.failUnlessEqual(provisioning.binomial(10, 1), 10)
-        self.failUnlessEqual(provisioning.binomial(10, 2), 45)
-        self.failUnlessEqual(provisioning.binomial(10, 9), 10)
-        self.failUnlessEqual(provisioning.binomial(10, 10), 1)
-
-DAY=24*60*60
-MONTH=31*DAY
-YEAR=365*DAY
-
-class Reliability(unittest.TestCase):
-    def test_basic(self):
-        if ReliabilityModel is None:
-            raise unittest.SkipTest("reliability model requires NumPy")
-
-        # test that numpy math works the way I think it does
-        import numpy
-        decay = numpy.matrix([[1,0,0],
-                             [.1,.9,0],
-                             [.01,.09,.9],
-                             ])
-        start = numpy.array([0,0,1])
-        g2 = (start * decay).A[0]
-        self.failUnlessEqual(repr(g2), repr(numpy.array([.01,.09,.9])))
-        g3 = (g2 * decay).A[0]
-        self.failUnlessEqual(repr(g3), repr(numpy.array([.028,.162,.81])))
-
-        # and the dot product
-        recoverable = numpy.array([0,1,1])
-        P_recoverable_g2 = numpy.dot(g2, recoverable)
-        self.failUnlessAlmostEqual(P_recoverable_g2, .9 + .09)
-        P_recoverable_g3 = numpy.dot(g3, recoverable)
-        self.failUnlessAlmostEqual(P_recoverable_g3, .81 + .162)
-
-        r = ReliabilityModel.run(delta=100000,
-                                 report_period=3*MONTH,
-                                 report_span=5*YEAR)
-        self.failUnlessEqual(len(r.samples), 20)
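(The expected count follows from the parameters: a five-year report span
divided into three-month report periods yields ceil(1825 days / 93 days)
= 20 sample rows.)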
-
-        last_row = r.samples[-1]
-        #print last_row
-        (when, unmaintained_shareprobs, maintained_shareprobs,
-         P_repaired_last_check_period,
-         cumulative_number_of_repairs,
-         cumulative_number_of_new_shares,
-         P_dead_unmaintained, P_dead_maintained) = last_row
-        self.failUnless(isinstance(P_repaired_last_check_period, float))
-        self.failUnless(isinstance(P_dead_unmaintained, float))
-        self.failUnless(isinstance(P_dead_maintained, float))
-        self.failUnlessAlmostEqual(P_dead_unmaintained, 0.033591004555395272)
-        self.failUnlessAlmostEqual(P_dead_maintained, 3.2983995819177542e-08)
-
index d918bc15412a0cdb972a10061537d40012546f96..edafd24250f1279f464e9e420b614b98c73560a5 100644 (file)
@@ -512,90 +512,6 @@ class Web(WebMixin, WebErrorMixin, testutil.StallMixin, testutil.ReallyEqualMixi
         d.addCallback(_check)
         return d
 
-    def test_provisioning(self):
-        d = self.GET("/provisioning/")
-        def _check(res):
-            self.failUnlessIn('Provisioning Tool', res)
-            self.failUnlessIn(FAVICON_MARKUP, res)
-
-            fields = {'filled': True,
-                      "num_users": int(50e3),
-                      "files_per_user": 1000,
-                      "space_per_user": int(1e9),
-                      "sharing_ratio": 1.0,
-                      "encoding_parameters": "3-of-10-5",
-                      "num_servers": 30,
-                      "ownership_mode": "A",
-                      "download_rate": 100,
-                      "upload_rate": 10,
-                      "delete_rate": 10,
-                      "lease_timer": 7,
-                      }
-            return self.POST("/provisioning/", **fields)
-
-        d.addCallback(_check)
-        def _check2(res):
-            self.failUnlessIn('Provisioning Tool', res)
-            self.failUnlessIn(FAVICON_MARKUP, res)
-            self.failUnlessIn("Share space consumed: 167.01TB", res)
-
-            fields = {'filled': True,
-                      "num_users": int(50e6),
-                      "files_per_user": 1000,
-                      "space_per_user": int(5e9),
-                      "sharing_ratio": 1.0,
-                      "encoding_parameters": "25-of-100-50",
-                      "num_servers": 30000,
-                      "ownership_mode": "E",
-                      "drive_failure_model": "U",
-                      "drive_size": 1000,
-                      "download_rate": 1000,
-                      "upload_rate": 100,
-                      "delete_rate": 100,
-                      "lease_timer": 7,
-                      }
-            return self.POST("/provisioning/", **fields)
-        d.addCallback(_check2)
-        def _check3(res):
-            self.failUnlessIn("Share space consumed: huge!", res)
-            fields = {'filled': True}
-            return self.POST("/provisioning/", **fields)
-        d.addCallback(_check3)
-        def _check4(res):
-            self.failUnlessIn("Share space consumed:", res)
-        d.addCallback(_check4)
-        return d
-
-    def test_reliability_tool(self):
-        try:
-            from allmydata import reliability
-            _hush_pyflakes = reliability
-            del _hush_pyflakes
-        except ImportError:
-            raise unittest.SkipTest("reliability tool requires NumPy")
-
-        d = self.GET("/reliability/")
-        def _check(res):
-            self.failUnlessIn('Reliability Tool', res)
-            fields = {'drive_lifetime': "8Y",
-                      "k": "3",
-                      "R": "7",
-                      "N": "10",
-                      "delta": "100000",
-                      "check_period": "1M",
-                      "report_period": "3M",
-                      "report_span": "5Y",
-                      }
-            return self.POST("/reliability/", **fields)
-
-        d.addCallback(_check)
-        def _check2(res):
-            self.failUnlessIn('Reliability Tool', res)
-            r = r'Probability of loss \(no maintenance\):\s+<span>0.033591'
-            self.failUnless(re.search(r, res), res)
-        d.addCallback(_check2)
-        return d
-
     def test_status(self):
         h = self.s.get_history()
         dl_num = h.list_all_download_statuses()[0].get_counter()
diff --git a/src/allmydata/web/provisioning.xhtml b/src/allmydata/web/provisioning.xhtml
deleted file mode 100644 (file)
index bfa4edb..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-<html xmlns:n="http://nevow.com/ns/nevow/0.1">
-  <head>
-    <title>Tahoe-LAFS - Provisioning Tool</title>
-    <link href="/tahoe.css" rel="stylesheet" type="text/css"/>
-    <link href="/icon.png" rel="shortcut icon" />
-    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-  </head>
-  <body>
-
-<h1>Tahoe-LAFS Provisioning Tool</h1>
-
-<p>This page will help you determine how much disk space and network
-bandwidth will be required by various sizes and types of Tahoe-LAFS networks.</p>
-
-<div n:render="forms" />
-
-  </body>
-</html>
diff --git a/src/allmydata/web/reliability.py b/src/allmydata/web/reliability.py
deleted file mode 100644 (file)
index d5d3406..0000000
+++ /dev/null
@@ -1,152 +0,0 @@
-
-from nevow import rend, tags as T
-reliability = None # might not be usable
-try:
-    from allmydata import reliability # requires NumPy
-except ImportError:
-    pass
-from allmydata.web.common import getxmlfile, get_arg
-
-
-DAY=24*60*60
-MONTH=31*DAY
-YEAR=365*DAY
-
-def is_available():
-    if reliability:
-        return True
-    return False
-
-def yandm(seconds):
-    return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
-
-class ReliabilityTool(rend.Page):
-    addSlash = True
-    docFactory = getxmlfile("reliability.xhtml")
-
-    DEFAULT_PARAMETERS = [
-        ("drive_lifetime", "8Y", "time",
-         "Average drive lifetime"),
-        ("k", 3, "int",
-         "Minimum number of shares needed to recover the file"),
-        ("R", 7, "int",
-         "Repair threshold: repair will not occur until fewer than R shares "
-         "are left"),
-        ("N", 10, "int",
-         "Total number of shares of the file generated"),
-        ("delta", "1M", "time", "Amount of time between each simulation step"),
-        ("check_period", "1M", "time",
-         "How often to run the checker and repair if fewer than R shares"),
-        ("report_period", "3M", "time",
-         "Amount of time between result rows in this report"),
-        ("report_span", "5Y", "time",
-         "Total amount of time covered by this report"),
-        ]
-
-    def parse_time(self, s):
-        if s.endswith("M"):
-            return int(s[:-1]) * MONTH
-        if s.endswith("Y"):
-            return int(s[:-1]) * YEAR
-        return int(s)
-
-    def format_time(self, s):
-        if s%YEAR == 0:
-            return "%dY" % (s/YEAR)
-        if s%MONTH == 0:
-            return "%dM" % (s/MONTH)
-        return "%d" % s
-
-    def get_parameters(self, ctx):
-        parameters = {}
-        for (name,default,argtype,description) in self.DEFAULT_PARAMETERS:
-            v = get_arg(ctx, name, default)
-            if argtype == "time":
-                value = self.parse_time(v)
-            else:
-                value = int(v)
-            parameters[name] = value
-        return parameters
-
-    def renderHTTP(self, ctx):
-        self.parameters = self.get_parameters(ctx)
-        self.results = reliability.ReliabilityModel.run(**self.parameters)
-        return rend.Page.renderHTTP(self, ctx)
-
-    def make_input(self, name, old_value):
-        return T.input(name=name, type="text", size="5",
-                       value=self.format_time(old_value))
-
-    def render_forms(self, ctx, data):
-        f = T.form(action=".", method="get")
-        table = []
-        for (name,default_value,argtype,description) in self.DEFAULT_PARAMETERS:
-            old_value = self.parameters[name]
-            i = self.make_input(name, old_value)
-            table.append(T.tr[T.td[name+":"], T.td[i], T.td[description]])
-        go = T.input(type="submit", value="Recompute")
-        return [T.h2["Simulation Parameters:"],
-                f[T.table[table], go],
-                ]
-
-    def data_simulation_table(self, ctx, data):
-        for row in self.results.samples:
-            yield row
-
-    def render_simulation_row(self, ctx, row):
-        (when, unmaintained_shareprobs, maintained_shareprobs,
-         P_repaired_last_check_period,
-         cumulative_number_of_repairs,
-         cumulative_number_of_new_shares,
-         P_dead_unmaintained, P_dead_maintained) = row
-        ctx.fillSlots("t", yandm(when))
-        ctx.fillSlots("P_repair", "%.6f" % P_repaired_last_check_period)
-        ctx.fillSlots("P_dead_unmaintained", "%.6g" % P_dead_unmaintained)
-        ctx.fillSlots("P_dead_maintained", "%.6g" % P_dead_maintained)
-        return ctx.tag
-
-    def render_report_span(self, ctx, row):
-        (when, unmaintained_shareprobs, maintained_shareprobs,
-         P_repaired_last_check_period,
-         cumulative_number_of_repairs,
-         cumulative_number_of_new_shares,
-         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
-        return ctx.tag[yandm(when)]
-
-    def render_P_loss_unmaintained(self, ctx, row):
-        (when, unmaintained_shareprobs, maintained_shareprobs,
-         P_repaired_last_check_period,
-         cumulative_number_of_repairs,
-         cumulative_number_of_new_shares,
-         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
-        return ctx.tag["%.6g (%1.8f%%)" % (P_dead_unmaintained,
-                                           100*P_dead_unmaintained)]
-
-    def render_P_loss_maintained(self, ctx, row):
-        (when, unmaintained_shareprobs, maintained_shareprobs,
-         P_repaired_last_check_period,
-         cumulative_number_of_repairs,
-         cumulative_number_of_new_shares,
-         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
-        return ctx.tag["%.6g (%1.8f%%)" % (P_dead_maintained,
-                                           100*P_dead_maintained)]
-
-    def render_P_repair_rate(self, ctx, row):
-        (when, unmaintained_shareprobs, maintained_shareprobs,
-         P_repaired_last_check_period,
-         cumulative_number_of_repairs,
-         cumulative_number_of_new_shares,
-         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
-        freq = when / cumulative_number_of_repairs
-        return ctx.tag["%.6g" % freq]
-
-    def render_P_repair_shares(self, ctx, row):
-        (when, unmaintained_shareprobs, maintained_shareprobs,
-         P_repaired_last_check_period,
-         cumulative_number_of_repairs,
-         cumulative_number_of_new_shares,
-         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
-        generated_shares = cumulative_number_of_new_shares / cumulative_number_of_repairs
-        return ctx.tag["%1.2f" % generated_shares]
-
-
diff --git a/src/allmydata/web/reliability.xhtml b/src/allmydata/web/reliability.xhtml
deleted file mode 100644 (file)
index f8d93d1..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-<html xmlns:n="http://nevow.com/ns/nevow/0.1">
-  <head>
-    <title>Tahoe-LAFS - Reliability Tool</title>
-    <link href="/tahoe.css" rel="stylesheet" type="text/css"/>
-    <link href="/icon.png" rel="shortcut icon" />
-    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-  </head>
-  <body>
-
-<h1>Tahoe-LAFS Reliability Tool</h1>
-
-<p>Given certain assumptions, this page calculates the probability of share loss
-over time, to help make informed decisions about how much redundancy and
-repair bandwidth to configure on a Tahoe-LAFS grid.</p>
-
-<div n:render="forms" />
-
-<h2>Simulation Results</h2>
-
-<p>At the end of the report span (elapsed time <span n:render="report_span"
-/>), the simulated file had the following properties:</p>
-
-<ul>
-    <li>Probability of loss (no maintenance):
-        <span n:render="P_loss_unmaintained"/></li>
-    <li>Probability of loss (with maintenance):
-        <span n:render="P_loss_maintained"/></li>
-    <li>Average repair frequency:
-        once every <span n:render="P_repair_rate"/> secs</li>
-    <li>Average shares generated per repair:
-        <span n:render="P_repair_shares"/></li>
-</ul>
-
-<p>This table shows how the following properties change over time:</p>
-<ul>
-  <li>P_repair: the chance that a repair was performed in the most recent
-  check period.</li>
-  <li>P_dead (unmaintained): the chance that the file will be unrecoverable
-  without periodic check+repair.</li>
-  <li>P_dead (maintained): the chance that the file will be unrecoverable even
-  with periodic check+repair.</li>
-</ul>
-
-<div>
-<table n:render="sequence" n:data="simulation_table">
-  <tr n:pattern="header">
-    <td>t</td>
-    <td>P_repair</td>
-    <td>P_dead (unmaintained)</td>
-    <td>P_dead (maintained)</td>
-  </tr>
-  <tr n:pattern="item" n:render="simulation_row">
-    <td><n:slot name="t"/></td>
-    <td><n:slot name="P_repair"/></td>
-    <td><n:slot name="P_dead_unmaintained"/></td>
-    <td><n:slot name="P_dead_maintained"/></td>
-  </tr>
-  <tr n:pattern="empty"><td>no simulation data!</td></tr>
-</table>
-</div>
-
-  </body>
-</html>
index 615f98d1d6f21b956f4877822d3b09e126825062..47793201d72ab60c61efdbc3cf108371c55124a0 100644 (file)
@@ -2,18 +2,17 @@ import time, os
 
 from twisted.internet import address
 from twisted.web import http
-from nevow import rend, url, loaders, tags as T
+from nevow import rend, url, tags as T
 from nevow.inevow import IRequest
 from nevow.static import File as nevow_File # TODO: merge with static.File?
 from nevow.util import resource_filename
 
 import allmydata # to display import path
 from allmydata import get_package_versions_string
-from allmydata import provisioning
 from allmydata.util import idlib, log
 from allmydata.interfaces import IFileNode
 from allmydata.web import filenode, directory, unlinked, status, operations
-from allmydata.web import reliability, storage
+from allmydata.web import storage
 from allmydata.web.common import abbreviate_size, getxmlfile, WebError, \
      get_arg, RenderMixin, get_format, get_mutable_type
 
@@ -126,20 +125,6 @@ class IncidentReporter(RenderMixin, rend.Page):
         req.setHeader("content-type", "text/plain")
         return "Thank you for your report!"
 
-class NoReliability(rend.Page):
-    docFactory = loaders.xmlstr('''\
-<html xmlns:n="http://nevow.com/ns/nevow/0.1">
-  <head>
-    <title>AllMyData - Tahoe</title>
-    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-  </head>
-  <body>
-  <h2>"Reliability" page not available</h2>
-  <p>Please install the python "NumPy" module to enable this page.</p>
-  </body>
-</html>
-''')
-
 SPACE = u"\u00A0"*2
 
 class Root(rend.Page):
 SPACE = u"\u00A0"*2
 
 class Root(rend.Page):
@@ -175,12 +160,6 @@ class Root(rend.Page):
         # needs to be created on each request
         return status.HelperStatus(self.client.helper)
 
-    child_provisioning = provisioning.ProvisioningTool()
-    if reliability.is_available():
-        child_reliability = reliability.ReliabilityTool()
-    else:
-        child_reliability = NoReliability()
-
     child_report_incident = IncidentReporter()
     #child_server # let's reserve this for storage-server-over-HTTP
 
index b5a191e4b7e98f50ea2ec19f46b2efe060678524..6bf1debab8dce4dbf35943ba6363e458e47d7c66 100644 (file)
@@ -91,9 +91,6 @@
   <div>Please visit the <a target="_blank" href="http://tahoe-lafs.org">Tahoe-LAFS home page</a> for
   code updates and bug reporting.</div>
 
-  <div>The <a href="provisioning">provisioning tool</a> and <a
-  href="reliability">reliability calculator</a> may also be useful.</div>
-
   <div n:render="incident_button" />
 </div>
 