From: Brian Warner Date: Sat, 14 Feb 2009 00:42:34 +0000 (-0700) Subject: build a 'reliability' web page, with a simulation of file decay and repair over time X-Git-Tag: allmydata-tahoe-1.4.0~225 X-Git-Url: https://git.rkrishnan.org/Site/Content/Exhibitors/running.html?a=commitdiff_plain;h=e2efd911a3f399230163a07b452d61de94f00c6c;p=tahoe-lafs%2Ftahoe-lafs.git build a 'reliability' web page, with a simulation of file decay and repair over time --- diff --git a/src/allmydata/provisioning.py b/src/allmydata/provisioning.py index c9cdf9a7..42986dba 100644 --- a/src/allmydata/provisioning.py +++ b/src/allmydata/provisioning.py @@ -707,6 +707,12 @@ class ProvisioningTool(rend.Page): all_sections, ] + try: + from allmydata import reliability + f = [T.div[T.href(a="reliability.html")["Reliability Math"]], f] + except ImportError: + pass + return f def file_availability(self, k, n, server_dBA): diff --git a/src/allmydata/reliability.py b/src/allmydata/reliability.py new file mode 100644 index 00000000..5db6a1f2 --- /dev/null +++ b/src/allmydata/reliability.py @@ -0,0 +1,264 @@ +#! /usr/bin/python + +import math +from allmydata.util import statistics +import Numeric +from Numeric import array, matrixmultiply as mm + +DAY=24*60*60 +MONTH=31*DAY +YEAR=365*DAY + +def my_dot(v1, v2): + #print v1.shape, v2.shape + #assert len(v1.shape) == 2 + #assert v1.shape[0] == 1 + #assert len(v2.shape) == 2 + #assert v2.shape[0] == 1 + #assert v1.shape[1] == v2.shape[1] + #for i in range(v1.shape[1]): + return Numeric.sum(Numeric.sum(v1*v2)) + +def yandm(seconds): + return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH)) + +class ReliabilityModel: + """Generate a model of system-wide reliability, given several input + parameters. + + This runs a simulation in which time is quantized down to 'delta' seconds + (default is one month): a smaller delta will result in a more accurate + simulation, but will take longer to run. 'report_span' simulated seconds + will be run. 
+ + The encoding parameters are provided as 'k' (minimum number of shares + needed to recover the file) and 'N' (total number of shares generated). + The default parameters are 3-of-10. + + The first step is to build a probability of individual drive loss during + any given delta. This uses a simple exponential model, in which the + average drive lifetime is specified by the 'drive_lifetime' parameter + (default is 8 years). + + The second step is to calculate a 'transition matrix': a table of + probabilities that shows, given A shares at the start of the delta, what + the chances are of having B shares left at the end of the delta. The + current code optimistically assumes all drives are independent. A + subclass could override that assumption. + + An additional 'repair matrix' is created to show what happens when the + Checker/Repairer is run. In the simulation, the Checker will be run every + 'check_period' seconds (default is one month), and the Repairer will be + run if it sees fewer than 'R' shares (default 7). + + The third step is to finally run the simulation. An initial probability + vector is created (with a 100% chance of N shares and a 0% chance of + fewer than N shares), then it is multiplied by the transition matrix for + every delta of time. Each time the Checker is to be run, the repair + matrix is multiplied in, and some additional stats are accumulated + (average number of repairs that occur, average number of shares + regenerated per repair). + + The output is a ReliabilityReport instance, which contains a table that + samples the state of the simulation once each 'report_period' seconds + (defaults to 3 months). Each row of this table will contain the + probability vector for one sample period (chance of having X shares, from + 0 to N, at the end of the period). The report will also contain other + information. 
+ + """ + + @classmethod + def run(klass, + drive_lifetime=8*YEAR, + k=3, R=7, N=10, + delta=1*MONTH, + check_period=1*MONTH, + report_period=3*MONTH, + report_span=5*YEAR, + ): + self = klass() + + check_period = check_period-1 + P = self.p_in_period(drive_lifetime, delta) + + decay = self.build_decay_matrix(N, P) + + repair = self.build_repair_matrix(k, N, R) + + #print "DECAY:", decay + #print "OLD-POST-REPAIR:", old_post_repair + #print "NEW-POST-REPAIR:", mm(decay, repair) + #print "REPAIR:", repair + #print "DIFF:", (old_post_repair - mm(decay, repair)) + + START = array([[0]*N + [1]]) + ALIVE = array([[0]*k + [1]*(1+N-k)]) + DEAD = array([[1]*k + [0]*(1+N-k)]) + REPAIRp = array([[0]*k + [1]*(R-k) + [0]*(1+N-R)]) + REPAIR_newshares = array([[0]*k + + [N-i for i in range(k, R)] + + [0]*(1+N-R)]) + assert REPAIR_newshares.shape[1] == N+1 + #print "START", START + #print "ALIVE", ALIVE + #print "REPAIRp", REPAIRp + #print "REPAIR_newshares", REPAIR_newshares + + unmaintained_state = START + maintained_state = START + last_check = 0 + last_report = 0 + P_repaired_last_check_period = 0.0 + needed_repairs = [] + needed_new_shares = [] + report = ReliabilityReport() + + for t in range(0, report_span+delta, delta): + unmaintained_state = mm(unmaintained_state, decay) + maintained_state = mm(maintained_state, decay) + if (t-last_check) > check_period: + last_check = t + # we do a check-and-repair this frequently + need_repair = my_dot(maintained_state, REPAIRp) + + P_repaired_last_check_period = need_repair + new_shares = my_dot(maintained_state, REPAIR_newshares) + needed_repairs.append(need_repair) + needed_new_shares.append(new_shares) + + maintained_state = mm(maintained_state, repair) + + if (t-last_report) > report_period: + last_report = t + P_dead_unmaintained = my_dot(unmaintained_state, DEAD) + P_dead_maintained = my_dot(maintained_state, DEAD) + cumulative_number_of_repairs = sum(needed_repairs) + cumulative_number_of_new_shares = sum(needed_new_shares) + 
report.add_sample(t, unmaintained_state, maintained_state, + P_repaired_last_check_period, + cumulative_number_of_repairs, + cumulative_number_of_new_shares, + P_dead_unmaintained, P_dead_maintained) + + # record one more sample at the end of the run + P_dead_unmaintained = my_dot(unmaintained_state, DEAD) + P_dead_maintained = my_dot(maintained_state, DEAD) + cumulative_number_of_repairs = sum(needed_repairs) + cumulative_number_of_new_shares = sum(needed_new_shares) + report.add_sample(t, unmaintained_state, maintained_state, + P_repaired_last_check_period, + cumulative_number_of_repairs, + cumulative_number_of_new_shares, + P_dead_unmaintained, P_dead_maintained) + + #needed_repairs_total = sum(needed_repairs) + #needed_new_shares_total = sum(needed_new_shares) + #print "at 2y:" + #print " unmaintained", unmaintained_state + #print " maintained", maintained_state + #print " number of repairs", needed_repairs_total + #print " new shares generated", needed_new_shares_total + #repair_rate_inv = report_span / needed_repairs_total + #print " avg repair rate: once every %s" % yandm(repair_rate_inv) + #print " avg repair download: one share every %s" % yandm(repair_rate_inv/k) + #print " avg repair upload: one share every %s" % yandm(report_span / needed_new_shares_total) + + return report + + def p_in_period(self, avg_lifetime, period): + """Given an average lifetime of a disk (using an exponential model), + what is the chance that a live disk will survive the next 'period' + seconds?""" + + # eg p_in_period(8*YEAR, MONTH) = 98.94% + return math.exp(-1.0*period/avg_lifetime) + + def build_decay_matrix(self, N, P): + """Return a decay matrix. 
decay[start_shares][end_shares] is the + conditional probability of finishing with end_shares, given that we + started with start_shares.""" + decay_rows = [] + decay_rows.append( [0.0]*(N+1) ) + for start_shares in range(1, (N+1)): + end_shares = self.build_decay_row(start_shares, P) + decay_row = end_shares + [0.0] * (N-start_shares) + assert len(decay_row) == (N+1), len(decay_row) + decay_rows.append(decay_row) + + decay = array(decay_rows) + return decay + + def build_decay_row(self, start_shares, P): + """Return a decay row 'end_shares'. end_shares[i] is the chance that + we finish with i shares, given that we started with start_shares, for + all i between 0 and start_shares, inclusive. This implementation + assumes that all shares are independent (IID), but a more complex + model could incorporate inter-share failure correlations like having + two shares on the same server.""" + end_shares = statistics.binomial_distribution_pmf(start_shares, P) + return end_shares + + def build_repair_matrix(self, k, N, R): + """Return a repair matrix. repair[start][end]: is the conditional + probability of the repairer finishing with 'end' shares, given that + it began with 'start' shares (repair if fewer than R shares). The + repairer's behavior is deterministic, so all values in this matrix + are either 0 or 1. 
This matrix should be applied *after* the decay + matrix.""" + new_repair_rows = [] + for start_shares in range(0, N+1): + new_repair_row = [0] * (N+1) + if start_shares < k: + new_repair_row[start_shares] = 1 + elif start_shares < R: + new_repair_row[N] = 1 + else: + new_repair_row[start_shares] = 1 + new_repair_rows.append(new_repair_row) + + repair = array(new_repair_rows) + return repair + +class ReliabilityReport: + def __init__(self): + self.samples = [] + + def add_sample(self, when, unmaintained_shareprobs, maintained_shareprobs, + P_repaired_last_check_period, + cumulative_number_of_repairs, + cumulative_number_of_new_shares, + P_dead_unmaintained, P_dead_maintained): + """ + when: the timestamp at the end of the report period + unmaintained_shareprobs: a vector of probabilities, element[S] + is the chance that there are S shares + left at the end of the report period. + This tracks what happens if no repair + is ever done. + maintained_shareprobs: same, but for 'maintained' grids, where + check and repair is done at the end + of each check period + P_repaired_last_check_period: a float, with the probability + that a repair was performed + at the end of the most recent + check period. 
+ cumulative_number_of_repairs: a float, with the average number + of repairs that will have been + performed by the end of the + report period + cumulative_number_of_new_shares: a float, with the average number + of new shares that repair proceses + generated by the end of the report + period + P_dead_unmaintained: a float, with the chance that the file will + be unrecoverable at the end of the period + P_dead_maintained: same, but for maintained grids + + """ + row = (when, unmaintained_shareprobs, maintained_shareprobs, + P_repaired_last_check_period, + cumulative_number_of_repairs, + cumulative_number_of_new_shares, + P_dead_unmaintained, P_dead_maintained) + self.samples.append(row) diff --git a/src/allmydata/web/reliability.py b/src/allmydata/web/reliability.py new file mode 100644 index 00000000..b91ed854 --- /dev/null +++ b/src/allmydata/web/reliability.py @@ -0,0 +1,144 @@ + +from nevow import rend, inevow, tags as T +reliability = None # might not be usable +try: + from allmydata import reliability # requires Numeric and PIL +except ImportError: + pass +from allmydata.web.common import getxmlfile, get_arg + + +DAY=24*60*60 +MONTH=31*DAY +YEAR=365*DAY + +def yandm(seconds): + return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH)) + +class ReliabilityTool(rend.Page): + addSlash = True + docFactory = getxmlfile("reliability.xhtml") + + DEFAULT_PARAMETERS = [ + ("drive_lifetime", "8Y", "time"), + ("k", 3, "int"), + ("R", 7, "int"), + ("N", 10, "int"), + ("delta", "1M", "time"), + ("check_period", "1M", "time"), + ("report_period", "3M", "time"), + ("report_span", "5Y", "time"), + ] + + def parse_time(self, s): + if s.endswith("M"): + return int(s[:-1]) * MONTH + if s.endswith("Y"): + return int(s[:-1]) * YEAR + return int(s) + + def format_time(self, s): + if s%YEAR == 0: + return "%dY" % (s/YEAR) + if s%MONTH == 0: + return "%dM" % (s/MONTH) + return "%d" % s + + def get_parameters(self, ctx): + req = inevow.IRequest(ctx) + parameters = {} + for 
name,default,argtype in self.DEFAULT_PARAMETERS: + v = get_arg(ctx, name, default) + if argtype == "time": + value = self.parse_time(v) + else: + value = int(v) + parameters[name] = value + return parameters + + def renderHTTP(self, ctx): + print "renderHTTP" + print "two" + self.parameters = self.get_parameters(ctx) + print "parms", self.parameters + self.results = reliability.ReliabilityModel.run(**self.parameters) + print "got results" + return rend.Page.renderHTTP(self, ctx) + + def make_input(self, name, old_value): + return T.input(name=name, type="text", + value=self.format_time(old_value)) + + def render_forms(self, ctx, data): + f = T.form(action=".", method="get") + table = [] + for name, default_value, argtype in self.DEFAULT_PARAMETERS: + old_value = self.parameters[name] + i = self.make_input(name, old_value) + table.append(T.tr[T.td[name+":"], T.td[i]]) + go = T.input(type="submit", value="Recompute") + return [T.h2["Simulation Parameters:"], + f[T.table[table], go], + ] + + def data_simulation_table(self, ctx, data): + for row in self.results.samples: + yield row + + def render_simulation_row(self, ctx, row): + (when, unmaintained_shareprobs, maintained_shareprobs, + P_repaired_last_check_period, + cumulative_number_of_repairs, + cumulative_number_of_new_shares, + P_dead_unmaintained, P_dead_maintained) = row + ctx.fillSlots("t", yandm(when)) + ctx.fillSlots("P_repair", "%.6f" % P_repaired_last_check_period) + ctx.fillSlots("P_dead_unmaintained", "%.6g" % P_dead_unmaintained) + ctx.fillSlots("P_dead_maintained", "%.6g" % P_dead_maintained) + return ctx.tag + + def render_report_span(self, ctx, row): + (when, unmaintained_shareprobs, maintained_shareprobs, + P_repaired_last_check_period, + cumulative_number_of_repairs, + cumulative_number_of_new_shares, + P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1] + return ctx.tag[yandm(when)] + + def render_P_loss_unmaintained(self, ctx, row): + (when, unmaintained_shareprobs, 
maintained_shareprobs, + P_repaired_last_check_period, + cumulative_number_of_repairs, + cumulative_number_of_new_shares, + P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1] + return ctx.tag["%.6g (%1.8f%%)" % (P_dead_unmaintained, + 100*P_dead_unmaintained)] + + def render_P_loss_maintained(self, ctx, row): + (when, unmaintained_shareprobs, maintained_shareprobs, + P_repaired_last_check_period, + cumulative_number_of_repairs, + cumulative_number_of_new_shares, + P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1] + return ctx.tag["%.6g (%1.8f%%)" % (P_dead_maintained, + 100*P_dead_maintained)] + + def render_P_repair_rate(self, ctx, row): + (when, unmaintained_shareprobs, maintained_shareprobs, + P_repaired_last_check_period, + cumulative_number_of_repairs, + cumulative_number_of_new_shares, + P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1] + freq = when / cumulative_number_of_repairs + return ctx.tag["%.6g" % freq] + + def render_P_repair_shares(self, ctx, row): + (when, unmaintained_shareprobs, maintained_shareprobs, + P_repaired_last_check_period, + cumulative_number_of_repairs, + cumulative_number_of_new_shares, + P_dead_unmaintained, P_dead_maintained) = self.results.samples[-1] + generated_shares = cumulative_number_of_new_shares / cumulative_number_of_repairs + return ctx.tag["%1.2f" % generated_shares] + + diff --git a/src/allmydata/web/reliability.xhtml b/src/allmydata/web/reliability.xhtml new file mode 100644 index 00000000..d8502031 --- /dev/null +++ b/src/allmydata/web/reliability.xhtml @@ -0,0 +1,64 @@ + + + AllMyData - Tahoe - Provisioning Tool + + + + + + +

Tahoe Reliability Tool

+ +

Given certain assumptions, this page calculates the probability of share loss +over time, to help make informed decisions about how much redundancy and +repair bandwidth to configure on a Tahoe grid.

+ +
+ +

Simulation Results

+ +

At the end of the report span (elapsed time ), the simulated file had the following properties:

+ + + +

This table shows how the following properties change over time:

+ + +
+ + + + + + + + + + + + + + +
t | P_repair | P_dead (unmaintained) | P_dead (maintained)
no simulation data!
+
+ + + diff --git a/src/allmydata/web/root.py b/src/allmydata/web/root.py index 81f6b367..b8f7b1a8 100644 --- a/src/allmydata/web/root.py +++ b/src/allmydata/web/root.py @@ -2,7 +2,7 @@ import time from twisted.internet import address from twisted.web import http -from nevow import rend, url, tags as T +from nevow import rend, url, loaders, tags as T from nevow.inevow import IRequest from nevow.static import File as nevow_File # TODO: merge with static.File? from nevow.util import resource_filename @@ -11,6 +11,11 @@ from formless import webform import allmydata # to display import path from allmydata import get_package_versions_string from allmydata import provisioning +reliability = None +try: + from allmydata.web import reliability # requires Numeric and PIL +except ImportError: + pass # might not be usable from allmydata.util import idlib, log from allmydata.interfaces import IFileNode from allmydata.web import filenode, directory, unlinked, status, operations @@ -111,6 +116,20 @@ class IncidentReporter(RenderMixin, rend.Page): req.setHeader("content-type", "text/plain") return "Thank you for your report!" +class NoReliability(rend.Page): + docFactory = loaders.xmlstr('''\ + + + AllMyData - Tahoe + + + + +

"Reliability" page not available

+

Please install the python "Numeric" module to enable this page.

+ + +''') class Root(rend.Page): @@ -130,6 +149,10 @@ class Root(rend.Page): child_tahoe_css = nevow_File(resource_filename('allmydata.web', 'tahoe.css')) child_provisioning = provisioning.ProvisioningTool() + if reliability: + child_reliability = reliability.ReliabilityTool() + else: + child_reliability = NoReliability() child_status = status.Status() child_helper_status = status.HelperStatus() child_statistics = status.Statistics() diff --git a/src/allmydata/web/welcome.xhtml b/src/allmydata/web/welcome.xhtml index d0acb197..2b8c7ad1 100644 --- a/src/allmydata/web/welcome.xhtml +++ b/src/allmydata/web/welcome.xhtml @@ -75,7 +75,8 @@
Please visit the Tahoe home page for code updates and bug reporting.
-
The provisioning tool may also be useful.
+
The provisioning tool and reliability calculator may also be useful.