]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - src/allmydata/reliability.py
build a 'reliability' web page, with a simulation of file decay and repair over time
[tahoe-lafs/tahoe-lafs.git] / src / allmydata / reliability.py
1 #! /usr/bin/python
2
3 import math
4 from allmydata.util import statistics
5 import Numeric
6 from Numeric import array, matrixmultiply as mm
7
# Time-unit constants, all expressed in seconds.
DAY=24*60*60
MONTH=31*DAY   # a "month" here is always 31 days
YEAR=365*DAY   # non-leap year
11
def my_dot(v1, v2):
    """Return the scalar dot product of two equal-shape arrays.

    Both arguments are element-wise multiplied and the product is
    reduced twice, so a plain scalar comes back even for the 2-D
    (1, M) row vectors this module passes around.
    """
    product = v1 * v2
    return Numeric.sum(Numeric.sum(product))
21
def yandm(seconds):
    """Render a duration in seconds as 'Yy.Mm' (whole years, then whole
    months of the remainder)."""
    whole_years = int(seconds / YEAR)
    leftover = seconds % YEAR
    whole_months = int(leftover / MONTH)
    return "%dy.%dm" % (whole_years, whole_months)
24
class ReliabilityModel:
    """Generate a model of system-wide reliability, given several input
    parameters.

    This runs a simulation in which time is quantized down to 'delta' seconds
    (default is one month): a smaller delta will result in a more accurate
    simulation, but will take longer to run. 'report_span' simulated seconds
    will be run.

    The encoding parameters are provided as 'k' (minimum number of shares
    needed to recover the file) and 'N' (total number of shares generated).
    The default parameters are 3-of-10.

    The first step is to build a probability of individual drive loss during
    any given delta. This uses a simple exponential model, in which the
    average drive lifetime is specified by the 'drive_lifetime' parameter
    (default is 8 years).

    The second step is to calculate a 'transition matrix': a table of
    probabilities that shows, given A shares at the start of the delta, what
    the chances are of having B shares left at the end of the delta. The
    current code optimistically assumes all drives are independent. A
    subclass could override that assumption.

    An additional 'repair matrix' is created to show what happens when the
    Checker/Repairer is run. In the simulation, the Checker will be run every
    'check_period' seconds (default is one month), and the Repairer will be
    run if it sees fewer than 'R' shares (default 7).

    The third step is to finally run the simulation. An initial probability
    vector is created (with a 100% chance of N shares and a 0% chance of
    fewer than N shares), then it is multiplied by the transition matrix for
    every delta of time. Each time the Checker is to be run, the repair
    matrix is multiplied in, and some additional stats are accumulated
    (average number of repairs that occur, average number of shares
    regenerated per repair).

    The output is a ReliabilityReport instance, which contains a table that
    samples the state of the simulation once each 'report_period' seconds
    (defaults to 3 months). Each row of this table will contain the
    probability vector for one sample period (chance of having X shares, from
    0 to N, at the end of the period). The report will also contain other
    information.

    """

    @classmethod
    def run(klass,
            drive_lifetime=8*YEAR,
            k=3, R=7, N=10,
            delta=1*MONTH,
            check_period=1*MONTH,
            report_period=3*MONTH,
            report_span=5*YEAR,
            ):
        """Run the simulation and return a populated ReliabilityReport.

        drive_lifetime: average drive lifetime in seconds (exponential model)
        k, R, N: k-of-N encoding; repair is triggered when the checker sees
                 fewer than R shares
        delta: simulation time step, in seconds
        check_period: interval between checker/repairer runs, in seconds
        report_period: interval between report samples, in seconds
        report_span: total simulated time, in seconds
        """
        self = klass()

        # Subtract one second so a check fires on the very step at which
        # (t - last_check) reaches check_period, not one full delta later.
        check_period = check_period-1
        # P: chance a single live drive survives one delta.
        P = self.p_in_period(drive_lifetime, delta)

        decay = self.build_decay_matrix(N, P)
        repair = self.build_repair_matrix(k, N, R)

        # Probability vectors indexed by the number of surviving shares
        # (0..N, so each has N+1 entries):
        START = array([[0]*N + [1]])            # 100% chance of N shares
        ALIVE = array([[0]*k + [1]*(1+N-k)])    # recoverable: >= k shares
        DEAD = array([[1]*k + [0]*(1+N-k)])     # unrecoverable: < k shares
        # States in which the repairer will act: k <= shares < R.
        REPAIRp = array([[0]*k + [1]*(R-k) + [0]*(1+N-R)])
        # Number of new shares a repair creates from each repairable state
        # (repair restores the file to a full N shares).
        REPAIR_newshares = array([[0]*k +
                                  [N-i for i in range(k, R)] +
                                  [0]*(1+N-R)])
        assert REPAIR_newshares.shape[1] == N+1

        unmaintained_state = START
        maintained_state = START
        t = 0   # keep 't' well-defined even if the loop body never runs
        last_check = 0
        last_report = 0
        P_repaired_last_check_period = 0.0
        needed_repairs = []
        needed_new_shares = []
        report = ReliabilityReport()

        for t in range(0, report_span+delta, delta):
            # Both grids decay each delta; only the maintained grid is
            # periodically repaired below.
            unmaintained_state = mm(unmaintained_state, decay)
            maintained_state = mm(maintained_state, decay)
            if (t-last_check) > check_period:
                last_check = t
                # Probability that this check finds the file in need of
                # repair, and the expected number of shares regenerated.
                need_repair = my_dot(maintained_state, REPAIRp)
                P_repaired_last_check_period = need_repair
                new_shares = my_dot(maintained_state, REPAIR_newshares)
                needed_repairs.append(need_repair)
                needed_new_shares.append(new_shares)

                maintained_state = mm(maintained_state, repair)

            if (t-last_report) > report_period:
                last_report = t
                P_dead_unmaintained = my_dot(unmaintained_state, DEAD)
                P_dead_maintained = my_dot(maintained_state, DEAD)
                cumulative_number_of_repairs = sum(needed_repairs)
                cumulative_number_of_new_shares = sum(needed_new_shares)
                report.add_sample(t, unmaintained_state, maintained_state,
                                  P_repaired_last_check_period,
                                  cumulative_number_of_repairs,
                                  cumulative_number_of_new_shares,
                                  P_dead_unmaintained, P_dead_maintained)

        # record one more sample at the end of the run
        P_dead_unmaintained = my_dot(unmaintained_state, DEAD)
        P_dead_maintained = my_dot(maintained_state, DEAD)
        cumulative_number_of_repairs = sum(needed_repairs)
        cumulative_number_of_new_shares = sum(needed_new_shares)
        report.add_sample(t, unmaintained_state, maintained_state,
                          P_repaired_last_check_period,
                          cumulative_number_of_repairs,
                          cumulative_number_of_new_shares,
                          P_dead_unmaintained, P_dead_maintained)

        return report

    def p_in_period(self, avg_lifetime, period):
        """Given an average lifetime of a disk (using an exponential model),
        what is the chance that a live disk will survive the next 'period'
        seconds?"""
        # e.g. p_in_period(8*YEAR, MONTH) = 98.94%
        return math.exp(-1.0*period/avg_lifetime)

    def build_decay_matrix(self, N, P):
        """Return a decay matrix. decay[start_shares][end_shares] is the
        conditional probability of finishing with end_shares, given that we
        started with start_shares. P is the per-delta single-share survival
        probability."""
        decay_rows = []
        # Zero shares is an absorbing state: once the file has decayed to
        # zero shares it stays there with probability 1. (An all-zero row
        # here would leak probability mass out of the state vector on every
        # delta, understating P_dead over time.)
        decay_rows.append( [1.0] + [0.0]*N )
        for start_shares in range(1, (N+1)):
            end_shares = self.build_decay_row(start_shares, P)
            # Pad with zeros: decay can never *increase* the share count.
            decay_row = end_shares + [0.0] * (N-start_shares)
            assert len(decay_row) == (N+1), len(decay_row)
            decay_rows.append(decay_row)

        decay = array(decay_rows)
        return decay

    def build_decay_row(self, start_shares, P):
        """Return a decay row 'end_shares'. end_shares[i] is the chance that
        we finish with i shares, given that we started with start_shares, for
        all i between 0 and start_shares, inclusive. This implementation
        assumes that all shares are independent (IID), but a more complex
        model could incorporate inter-share failure correlations like having
        two shares on the same server."""
        end_shares = statistics.binomial_distribution_pmf(start_shares, P)
        return end_shares

    def build_repair_matrix(self, k, N, R):
        """Return a repair matrix. repair[start][end]: is the conditional
        probability of the repairer finishing with 'end' shares, given that
        it began with 'start' shares (repair if fewer than R shares). The
        repairer's behavior is deterministic, so all values in this matrix
        are either 0 or 1. This matrix should be applied *after* the decay
        matrix."""
        new_repair_rows = []
        for start_shares in range(0, N+1):
            new_repair_row = [0] * (N+1)
            if start_shares < k:
                # below k the file is unrecoverable: repair cannot help
                new_repair_row[start_shares] = 1
            elif start_shares < R:
                # repairable and below the repair threshold: restore to N
                new_repair_row[N] = 1
            else:
                # healthy enough: leave it alone
                new_repair_row[start_shares] = 1
            new_repair_rows.append(new_repair_row)

        repair = array(new_repair_rows)
        return repair
222
class ReliabilityReport:
    """Accumulates one sample row per report period of the simulation.

    Rows are stored, in insertion order, in the 'samples' list; each row
    is the 8-tuple of arguments passed to add_sample().
    """

    def __init__(self):
        # ordered list of sample tuples, one per call to add_sample()
        self.samples = []

    def add_sample(self, when, unmaintained_shareprobs, maintained_shareprobs,
                   P_repaired_last_check_period,
                   cumulative_number_of_repairs,
                   cumulative_number_of_new_shares,
                   P_dead_unmaintained, P_dead_maintained):
        """Append one report-period sample to the table.

        when: the timestamp at the end of the report period
        unmaintained_shareprobs: a vector of probabilities, element[S]
                                 is the chance that there are S shares
                                 left at the end of the report period.
                                 This tracks what happens if no repair
                                 is ever done.
        maintained_shareprobs: same, but for 'maintained' grids, where
                               check and repair is done at the end
                               of each check period
        P_repaired_last_check_period: a float, with the probability
                                      that a repair was performed
                                      at the end of the most recent
                                      check period.
        cumulative_number_of_repairs: a float, with the average number
                                      of repairs that will have been
                                      performed by the end of the
                                      report period
        cumulative_number_of_new_shares: a float, with the average number
                                         of new shares that repair proceses
                                         generated by the end of the report
                                         period
        P_dead_unmaintained: a float, with the chance that the file will
                             be unrecoverable at the end of the period
        P_dead_maintained: same, but for maintained grids
        """
        self.samples.append((when,
                             unmaintained_shareprobs,
                             maintained_shareprobs,
                             P_repaired_last_check_period,
                             cumulative_number_of_repairs,
                             cumulative_number_of_new_shares,
                             P_dead_unmaintained,
                             P_dead_maintained))