From: Brian Warner <warner@lothar.com>
Date: Sat, 14 Feb 2009 00:42:34 +0000 (-0700)
Subject: build a 'reliability' web page, with a simulation of file decay and repair over time
X-Git-Tag: allmydata-tahoe-1.4.0~225
X-Git-Url: https://git.rkrishnan.org/pf/content/%22news.html/%3C?a=commitdiff_plain;h=e2efd911a3f399230163a07b452d61de94f00c6c;p=tahoe-lafs%2Ftahoe-lafs.git

build a 'reliability' web page, with a simulation of file decay and repair over time
---

diff --git a/src/allmydata/provisioning.py b/src/allmydata/provisioning.py
index c9cdf9a7..42986dba 100644
--- a/src/allmydata/provisioning.py
+++ b/src/allmydata/provisioning.py
@@ -707,6 +707,12 @@ class ProvisioningTool(rend.Page):
               all_sections,
               ]
 
+        try:
+            from allmydata import reliability
+            f = [T.div[T.href(a="reliability.html")["Reliability Math"]], f]
+        except ImportError:
+            pass
+
         return f
 
     def file_availability(self, k, n, server_dBA):
diff --git a/src/allmydata/reliability.py b/src/allmydata/reliability.py
new file mode 100644
index 00000000..5db6a1f2
--- /dev/null
+++ b/src/allmydata/reliability.py
@@ -0,0 +1,264 @@
+#! /usr/bin/python
+
+import math
+from allmydata.util import statistics
+import Numeric
+from Numeric import array, matrixmultiply as mm
+
+DAY=24*60*60
+MONTH=31*DAY
+YEAR=365*DAY
+
+def my_dot(v1, v2):
+    #print v1.shape, v2.shape
+    #assert len(v1.shape) == 2
+    #assert v1.shape[0] == 1
+    #assert len(v2.shape) == 2
+    #assert v2.shape[0] == 1
+    #assert v1.shape[1] == v2.shape[1]
+    #for i in range(v1.shape[1]):
+    return Numeric.sum(Numeric.sum(v1*v2))
+
+def yandm(seconds):
+    return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
+
+class ReliabilityModel:
+    """Generate a model of system-wide reliability, given several input
+    parameters.
+
+    This runs a simulation in which time is quantized down to 'delta' seconds
+    (default is one month): a smaller delta will result in a more accurate
+    simulation, but will take longer to run. 'report_span' simulated seconds
+    will be run.
+
+    The encoding parameters are provided as 'k' (minimum number of shares
+    needed to recover the file) and 'N' (total number of shares generated).
+    The default parameters are 3-of-10.
+
+    The first step is to build a probability of individual drive loss during
+    any given delta. This uses a simple exponential model, in which the
+    average drive lifetime is specified by the 'drive_lifetime' parameter
+    (default is 8 years).
+
+    The second step is to calculate a 'transition matrix': a table of
+    probabilities that shows, given A shares at the start of the delta, what
+    the chances are of having B shares left at the end of the delta. The
+    current code optimistically assumes all drives are independent. A
+    subclass could override that assumption.
+
+    An additional 'repair matrix' is created to show what happens when the
+    Checker/Repairer is run. In the simulation, the Checker will be run every
+    'check_period' seconds (default is one month), and the Repairer will be
+    run if it sees fewer than 'R' shares (default 7).
+
+    The third step is to finally run the simulation. An initial probability
+    vector is created (with a 100% chance of N shares and a 0% chance of
+    fewer than N shares), then it is multiplied by the transition matrix for
+    every delta of time. Each time the Checker is to be run, the repair
+    matrix is multiplied in, and some additional stats are accumulated
+    (average number of repairs that occur, average number of shares
+    regenerated per repair).
+
+    The output is a ReliabilityReport instance, which contains a table that
+    samples the state of the simulation once each 'report_period' seconds
+    (defaults to 3 months). Each row of this table will contain the
+    probability vector for one sample period (chance of having X shares, from
+    0 to N, at the end of the period). The report will also contain other
+    information.
+
+    """
+
+    @classmethod
+    def run(klass,
+            drive_lifetime=8*YEAR,
+            k=3, R=7, N=10,
+            delta=1*MONTH,
+            check_period=1*MONTH,
+            report_period=3*MONTH,
+            report_span=5*YEAR,
+            ):
+        self = klass()
+
+        check_period = check_period-1
+        P = self.p_in_period(drive_lifetime, delta)
+
+        decay = self.build_decay_matrix(N, P)
+
+        repair = self.build_repair_matrix(k, N, R)
+
+        #print "DECAY:", decay
+        #print "OLD-POST-REPAIR:", old_post_repair
+        #print "NEW-POST-REPAIR:", mm(decay, repair)
+        #print "REPAIR:", repair
+        #print "DIFF:", (old_post_repair - mm(decay, repair))
+
+        START = array([[0]*N + [1]])
+        ALIVE = array([[0]*k + [1]*(1+N-k)])
+        DEAD = array([[1]*k + [0]*(1+N-k)])
+        REPAIRp = array([[0]*k + [1]*(R-k) + [0]*(1+N-R)])
+        REPAIR_newshares = array([[0]*k +
+                                  [N-i for i in range(k, R)] +
+                                  [0]*(1+N-R)])
+        assert REPAIR_newshares.shape[1] == N+1
+        #print "START", START
+        #print "ALIVE", ALIVE
+        #print "REPAIRp", REPAIRp
+        #print "REPAIR_newshares", REPAIR_newshares
+
+        unmaintained_state = START
+        maintained_state = START
+        last_check = 0
+        last_report = 0
+        P_repaired_last_check_period = 0.0
+        needed_repairs = []
+        needed_new_shares = []
+        report = ReliabilityReport()
+
+        for t in range(0, report_span+delta, delta):
+            unmaintained_state = mm(unmaintained_state, decay)
+            maintained_state = mm(maintained_state, decay)
+            if (t-last_check) > check_period:
+                last_check = t
+                # we do a check-and-repair this frequently
+                need_repair = my_dot(maintained_state, REPAIRp)
+
+                P_repaired_last_check_period = need_repair
+                new_shares = my_dot(maintained_state, REPAIR_newshares)
+                needed_repairs.append(need_repair)
+                needed_new_shares.append(new_shares)
+
+                maintained_state = mm(maintained_state, repair)
+
+            if (t-last_report) > report_period:
+                last_report = t
+                P_dead_unmaintained = my_dot(unmaintained_state, DEAD)
+                P_dead_maintained = my_dot(maintained_state, DEAD)
+                cumulative_number_of_repairs = sum(needed_repairs)
+                cumulative_number_of_new_shares = sum(needed_new_shares)
+                report.add_sample(t, unmaintained_state, maintained_state,
+                                  P_repaired_last_check_period,
+                                  cumulative_number_of_repairs,
+                                  cumulative_number_of_new_shares,
+                                  P_dead_unmaintained, P_dead_maintained)
+
+        # record one more sample at the end of the run
+        P_dead_unmaintained = my_dot(unmaintained_state, DEAD)
+        P_dead_maintained = my_dot(maintained_state, DEAD)
+        cumulative_number_of_repairs = sum(needed_repairs)
+        cumulative_number_of_new_shares = sum(needed_new_shares)
+        report.add_sample(t, unmaintained_state, maintained_state,
+                          P_repaired_last_check_period,
+                          cumulative_number_of_repairs,
+                          cumulative_number_of_new_shares,
+                          P_dead_unmaintained, P_dead_maintained)
+
+        #needed_repairs_total = sum(needed_repairs)
+        #needed_new_shares_total = sum(needed_new_shares)
+        #print "at 2y:"
+        #print " unmaintained", unmaintained_state
+        #print " maintained", maintained_state
+        #print " number of repairs", needed_repairs_total
+        #print " new shares generated", needed_new_shares_total
+        #repair_rate_inv = report_span / needed_repairs_total
+        #print "  avg repair rate: once every %s" % yandm(repair_rate_inv)
+        #print "  avg repair download: one share every %s" % yandm(repair_rate_inv/k)
+        #print "  avg repair upload: one share every %s" % yandm(report_span / needed_new_shares_total)
+
+        return report
+
+    def p_in_period(self, avg_lifetime, period):
+        """Given an average lifetime of a disk (using an exponential model),
+        what is the chance that a live disk will survive the next 'period'
+        seconds?"""
+
+        # eg p_in_period(8*YEAR, MONTH) = 98.94%
+        return math.exp(-1.0*period/avg_lifetime)
+
+    def build_decay_matrix(self, N, P):
+        """Return a decay matrix. decay[start_shares][end_shares] is the
+        conditional probability of finishing with end_shares, given that we
+        started with start_shares."""
+        decay_rows = []
+        decay_rows.append( [0.0]*(N+1) )
+        for start_shares in range(1, (N+1)):
+            end_shares = self.build_decay_row(start_shares, P)
+            decay_row = end_shares + [0.0] * (N-start_shares)
+            assert len(decay_row) == (N+1), len(decay_row)
+            decay_rows.append(decay_row)
+
+        decay = array(decay_rows)
+        return decay
+
+    def build_decay_row(self, start_shares, P):
+        """Return a decay row 'end_shares'. end_shares[i] is the chance that
+        we finish with i shares, given that we started with start_shares, for
+        all i between 0 and start_shares, inclusive. This implementation
+        assumes that all shares are independent (IID), but a more complex
+        model could incorporate inter-share failure correlations like having
+        two shares on the same server."""
+        end_shares = statistics.binomial_distribution_pmf(start_shares, P)
+        return end_shares
+
+    def build_repair_matrix(self, k, N, R):
+        """Return a repair matrix. repair[start][end]: is the conditional
+        probability of the repairer finishing with 'end' shares, given that
+        it began with 'start' shares (repair if fewer than R shares). The
+        repairer's behavior is deterministic, so all values in this matrix
+        are either 0 or 1. This matrix should be applied *after* the decay
+        matrix."""
+        new_repair_rows = []
+        for start_shares in range(0, N+1):
+            new_repair_row = [0] * (N+1)
+            if start_shares < k:
+                new_repair_row[start_shares] = 1
+            elif start_shares < R:
+                new_repair_row[N] = 1
+            else:
+                new_repair_row[start_shares] = 1
+            new_repair_rows.append(new_repair_row)
+
+        repair = array(new_repair_rows)
+        return repair
+
+class ReliabilityReport:
+    def __init__(self):
+        self.samples = []
+
+    def add_sample(self, when, unmaintained_shareprobs, maintained_shareprobs,
+                   P_repaired_last_check_period,
+                   cumulative_number_of_repairs,
+                   cumulative_number_of_new_shares,
+                   P_dead_unmaintained, P_dead_maintained):
+        """
+        when: the timestamp at the end of the report period
+        unmaintained_shareprobs: a vector of probabilities, element[S]
+                                 is the chance that there are S shares
+                                 left at the end of the report period.
+                                 This tracks what happens if no repair
+                                 is ever done.
+        maintained_shareprobs: same, but for 'maintained' grids, where
+                               check and repair is done at the end
+                               of each check period
+        P_repaired_last_check_period: a float, with the probability
+                                      that a repair was performed
+                                      at the end of the most recent
+                                      check period.
+        cumulative_number_of_repairs: a float, with the average number
+                                      of repairs that will have been
+                                      performed by the end of the
+                                      report period
+        cumulative_number_of_new_shares: a float, with the average number
+                                         of new shares that repair proceses
+                                         generated by the end of the report
+                                         period
+        P_dead_unmaintained: a float, with the chance that the file will
+                             be unrecoverable at the end of the period
+        P_dead_maintained: same, but for maintained grids
+
+        """
+        row = (when, unmaintained_shareprobs, maintained_shareprobs,
+               P_repaired_last_check_period,
+               cumulative_number_of_repairs,
+               cumulative_number_of_new_shares,
+               P_dead_unmaintained, P_dead_maintained)
+        self.samples.append(row)
diff --git a/src/allmydata/web/reliability.py b/src/allmydata/web/reliability.py
new file mode 100644
index 00000000..b91ed854
--- /dev/null
+++ b/src/allmydata/web/reliability.py
@@ -0,0 +1,144 @@
+
+from nevow import rend, inevow, tags as T
+reliability = None # might not be usable
+try:
+    from allmydata import reliability # requires Numeric and PIL
+except ImportError:
+    pass
+from allmydata.web.common import getxmlfile, get_arg
+
+
+DAY=24*60*60
+MONTH=31*DAY
+YEAR=365*DAY
+
+def yandm(seconds):
+    return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
+
+class ReliabilityTool(rend.Page):
+    addSlash = True
+    docFactory = getxmlfile("reliability.xhtml")
+
+    DEFAULT_PARAMETERS = [
+        ("drive_lifetime", "8Y", "time"),
+        ("k", 3, "int"),
+        ("R", 7, "int"),
+        ("N", 10, "int"),
+        ("delta", "1M", "time"),
+        ("check_period", "1M", "time"),
+        ("report_period", "3M", "time"),
+        ("report_span", "5Y", "time"),
+        ]
+
+    def parse_time(self, s):
+        if s.endswith("M"):
+            return int(s[:-1]) * MONTH
+        if s.endswith("Y"):
+            return int(s[:-1]) * YEAR
+        return int(s)
+
+    def format_time(self, s):
+        if s%YEAR == 0:
+            return "%dY" % (s/YEAR)
+        if s%MONTH == 0:
+            return "%dM" % (s/MONTH)
+        return "%d" % s
+
+    def get_parameters(self, ctx):
+        req = inevow.IRequest(ctx)
+        parameters = {}
+        for name,default,argtype in self.DEFAULT_PARAMETERS:
+            v = get_arg(ctx, name, default)
+            if argtype == "time":
+                value = self.parse_time(v)
+            else:
+                value = int(v)
+            parameters[name] = value
+        return parameters
+
+    def renderHTTP(self, ctx):
+        print "renderHTTP"
+        print "two"
+        self.parameters = self.get_parameters(ctx)
+        print "parms", self.parameters
+        self.results = reliability.ReliabilityModel.run(**self.parameters)
+        print "got results"
+        return rend.Page.renderHTTP(self, ctx)
+
+    def make_input(self, name, old_value):
+        return T.input(name=name, type="text",
+                       value=self.format_time(old_value))
+
+    def render_forms(self, ctx, data):
+        f = T.form(action=".", method="get")
+        table = []
+        for name, default_value, argtype in self.DEFAULT_PARAMETERS:
+            old_value = self.parameters[name]
+            i = self.make_input(name, old_value)
+            table.append(T.tr[T.td[name+":"], T.td[i]])
+        go = T.input(type="submit", value="Recompute")
+        return [T.h2["Simulation Parameters:"],
+                f[T.table[table], go],
+                ]
+
+    def data_simulation_table(self, ctx, data):
+        for row in self.results.samples:
+            yield row
+
+    def render_simulation_row(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = row
+        ctx.fillSlots("t", yandm(when))
+        ctx.fillSlots("P_repair", "%.6f" % P_repaired_last_check_period)
+        ctx.fillSlots("P_dead_unmaintained", "%.6g" % P_dead_unmaintained)
+        ctx.fillSlots("P_dead_maintained", "%.6g" % P_dead_maintained)
+        return ctx.tag
+
+    def render_report_span(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
+        return ctx.tag[yandm(when)]
+
+    def render_P_loss_unmaintained(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
+        return ctx.tag["%.6g (%1.8f%%)" % (P_dead_unmaintained,
+                                           100*P_dead_unmaintained)]
+
+    def render_P_loss_maintained(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
+        return ctx.tag["%.6g (%1.8f%%)" % (P_dead_maintained,
+                                           100*P_dead_maintained)]
+
+    def render_P_repair_rate(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
+        freq = when / cumulative_number_of_repairs
+        return ctx.tag["%.6g" % freq]
+
+    def render_P_repair_shares(self, ctx, row):
+        (when, unmaintained_shareprobs, maintained_shareprobs,
+         P_repaired_last_check_period,
+         cumulative_number_of_repairs,
+         cumulative_number_of_new_shares,
+         P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1]
+        generated_shares = cumulative_number_of_new_shares / cumulative_number_of_repairs
+        return ctx.tag["%1.2f" % generated_shares]
+
+
diff --git a/src/allmydata/web/reliability.xhtml b/src/allmydata/web/reliability.xhtml
new file mode 100644
index 00000000..d8502031
--- /dev/null
+++ b/src/allmydata/web/reliability.xhtml
@@ -0,0 +1,64 @@
+<html xmlns:n="http://nevow.com/ns/nevow/0.1">
+  <head>
+    <title>AllMyData - Tahoe - Provisioning Tool</title>
+    <!-- <link href="http://www.allmydata.com/common/css/styles.css"
+          rel="stylesheet" type="text/css"/> -->
+    <link href="/webform_css" rel="stylesheet" type="text/css"/>
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+  </head>
+  <body>
+
+<h1>Tahoe Reliability Tool</h1>
+
+<p>Given certain assumptions, this page calculates probability of share loss
+over time, to help make informed decisions about how much redundancy and
+repair bandwidth to configure on a Tahoe grid.</p>
+
+<div n:render="forms" />
+
+<h2>Simulation Results</h2>
+
+<p>At the end of the report span (elapsed time <span n:render="report_span"
+/>), the simulated file had the following properties:</p>
+
+<ul>
+    <li>Probability of loss (no maintenance):
+        <span n:render="P_loss_unmaintained"/></li>
+    <li>Probability of loss (with maintenance):
+        <span n:render="P_loss_maintained"/></li>
+    <li>Average repair frequency:
+        once every <span n:render="P_repair_rate"/> secs</li>
+    <li>Average shares generated per repair:
+        <span n:render="P_repair_shares"/></li>
+</ul>
+
+<p>This table shows how the following properties change over time:</p>
+<ul>
+  <li>P_repair: the chance that a repair was performed in the most recent
+  check period.</li>
+  <li>P_dead (unmaintained): the chance that the file will be unrecoverable
+  without periodic check+repair</li>
+  <li>P_dead (maintained): the chance that the file will be recoverable even
+  with periodic check+repair</li>
+</ul>
+
+<div>
+<table n:render="sequence" n:data="simulation_table" border="1">
+  <tr n:pattern="header">
+    <td>t</td>
+    <td>P_repair</td>
+    <td>P_dead (unmaintained)</td>
+    <td>P_dead (maintained)</td>
+  </tr>
+  <tr n:pattern="item" n:render="simulation_row">
+    <td><n:slot name="t"/></td>
+    <td><n:slot name="P_repair"/></td>
+    <td><n:slot name="P_dead_unmaintained"/></td>
+    <td><n:slot name="P_dead_maintained"/></td>
+  </tr>
+  <tr n:pattern="empty"><td>no simulation data!</td></tr>
+</table>
+</div>
+
+  </body>
+</html>
diff --git a/src/allmydata/web/root.py b/src/allmydata/web/root.py
index 81f6b367..b8f7b1a8 100644
--- a/src/allmydata/web/root.py
+++ b/src/allmydata/web/root.py
@@ -2,7 +2,7 @@ import time
 
 from twisted.internet import address
 from twisted.web import http
-from nevow import rend, url, tags as T
+from nevow import rend, url, loaders, tags as T
 from nevow.inevow import IRequest
 from nevow.static import File as nevow_File # TODO: merge with static.File?
 from nevow.util import resource_filename
@@ -11,6 +11,11 @@ from formless import webform
 import allmydata # to display import path
 from allmydata import get_package_versions_string
 from allmydata import provisioning
+reliability = None
+try:
+    from allmydata.web import reliability # requires Numeric and PIL
+except ImportError:
+    pass # might not be usable
 from allmydata.util import idlib, log
 from allmydata.interfaces import IFileNode
 from allmydata.web import filenode, directory, unlinked, status, operations
@@ -111,6 +116,20 @@ class IncidentReporter(RenderMixin, rend.Page):
         req.setHeader("content-type", "text/plain")
         return "Thank you for your report!"
 
+class NoReliability(rend.Page):
+    docFactory = loaders.xmlstr('''\
+<html xmlns:n="http://nevow.com/ns/nevow/0.1">
+  <head>
+    <title>AllMyData - Tahoe</title>
+    <link href="/webform_css" rel="stylesheet" type="text/css"/>
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+  </head>
+  <body>
+  <h2>"Reliability" page not available</h2>
+  <p>Please install the python "Numeric" module to enable this page.</p>
+  </body>
+</html>
+''')
 
 class Root(rend.Page):
 
@@ -130,6 +149,10 @@ class Root(rend.Page):
     child_tahoe_css = nevow_File(resource_filename('allmydata.web', 'tahoe.css'))
 
     child_provisioning = provisioning.ProvisioningTool()
+    if reliability:
+        child_reliability = reliability.ReliabilityTool()
+    else:
+        child_reliability = NoReliability()
     child_status = status.Status()
     child_helper_status = status.HelperStatus()
     child_statistics = status.Statistics()
diff --git a/src/allmydata/web/welcome.xhtml b/src/allmydata/web/welcome.xhtml
index d0acb197..2b8c7ad1 100644
--- a/src/allmydata/web/welcome.xhtml
+++ b/src/allmydata/web/welcome.xhtml
@@ -75,7 +75,8 @@
 <div>Please visit the <a href="http://allmydata.org">Tahoe home page</a> for
 code updates and bug reporting.</div>
 
-<div>The <a href="provisioning">provisioning tool</a> may also be useful.</div>
+<div>The <a href="provisioning">provisioning tool</a> and <a
+href="reliability">reliability calculator</a> may also be useful.</div>
 
 <div n:render="incident_button" />