]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - src/allmydata/reliability.py
a0d60769bdec9ca6d0020868854c647aabf44891
[tahoe-lafs/tahoe-lafs.git] / src / allmydata / reliability.py
1 #! /usr/bin/python
2
3 import math
4 from allmydata.util import statistics
5 from numpy import array, matrix, dot
6
7 DAY=24*60*60
8 MONTH=31*DAY
9 YEAR=365*DAY
10
11 class ReliabilityModel:
12     """Generate a model of system-wide reliability, given several input
13     parameters.
14
15     This runs a simulation in which time is quantized down to 'delta' seconds
16     (default is one month): a smaller delta will result in a more accurate
17     simulation, but will take longer to run. 'report_span' simulated seconds
18     will be run.
19
20     The encoding parameters are provided as 'k' (minimum number of shares
21     needed to recover the file) and 'N' (total number of shares generated).
22     The default parameters are 3-of-10.
23
24     The first step is to build a probability of individual drive loss during
25     any given delta. This uses a simple exponential model, in which the
26     average drive lifetime is specified by the 'drive_lifetime' parameter
27     (default is 8 years).
28
29     The second step is to calculate a 'transition matrix': a table of
30     probabilities that shows, given A shares at the start of the delta, what
31     the chances are of having B shares left at the end of the delta. The
32     current code optimistically assumes all drives are independent. A
33     subclass could override that assumption.
34
35     An additional 'repair matrix' is created to show what happens when the
36     Checker/Repairer is run. In the simulation, the Checker will be run every
37     'check_period' seconds (default is one month), and the Repairer will be
38     run if it sees fewer than 'R' shares (default 7).
39
40     The third step is to finally run the simulation. An initial probability
41     vector is created (with a 100% chance of N shares and a 0% chance of
42     fewer than N shares), then it is multiplied by the transition matrix for
43     every delta of time. Each time the Checker is to be run, the repair
44     matrix is multiplied in, and some additional stats are accumulated
45     (average number of repairs that occur, average number of shares
46     regenerated per repair).
47
48     The output is a ReliabilityReport instance, which contains a table that
49     samples the state of the simulation once each 'report_period' seconds
50     (defaults to 3 months). Each row of this table will contain the
51     probability vector for one sample period (chance of having X shares, from
52     0 to N, at the end of the period). The report will also contain other
53     information.
54
55     """
56
57     @classmethod
58     def run(klass,
59             drive_lifetime=8*YEAR,
60             k=3, R=7, N=10,
61             delta=1*MONTH,
62             check_period=1*MONTH,
63             report_period=3*MONTH,
64             report_span=5*YEAR,
65             ):
66         self = klass()
67
68         check_period = check_period-1
69         P = self.p_in_period(drive_lifetime, delta)
70
71         decay = self.build_decay_matrix(N, P)
72
73         repair = self.build_repair_matrix(k, N, R)
74
75         #print "DECAY:", decay
76         #print "OLD-POST-REPAIR:", old_post_repair
77         #print "NEW-POST-REPAIR:", decay * repair
78         #print "REPAIR:", repair
79         #print "DIFF:", (old_post_repair - decay * repair)
80
81         START = array([0]*N + [1])
82         DEAD = array([1]*k + [0]*(1+N-k))
83         REPAIRp = array([0]*k + [1]*(R-k) + [0]*(1+N-R))
84         REPAIR_newshares = array([0]*k +
85                                  [N-i for i in range(k, R)] +
86                                  [0]*(1+N-R))
87         assert REPAIR_newshares.shape[0] == N+1
88         #print "START", START
89         #print "REPAIRp", REPAIRp
90         #print "REPAIR_newshares", REPAIR_newshares
91
92         unmaintained_state = START
93         maintained_state = START
94         last_check = 0
95         last_report = 0
96         P_repaired_last_check_period = 0.0
97         needed_repairs = []
98         needed_new_shares = []
99         report = ReliabilityReport()
100
101         for t in range(0, report_span+delta, delta):
102             # the .A[0] turns the one-row matrix back into an array
103             unmaintained_state = (unmaintained_state * decay).A[0]
104             maintained_state = (maintained_state * decay).A[0]
105             if (t-last_check) > check_period:
106                 last_check = t
107                 # we do a check-and-repair this frequently
108                 need_repair = dot(maintained_state, REPAIRp)
109
110                 P_repaired_last_check_period = need_repair
111                 new_shares = dot(maintained_state, REPAIR_newshares)
112                 needed_repairs.append(need_repair)
113                 needed_new_shares.append(new_shares)
114
115                 maintained_state = (maintained_state * repair).A[0]
116
117             if (t-last_report) > report_period:
118                 last_report = t
119                 P_dead_unmaintained = dot(unmaintained_state, DEAD)
120                 P_dead_maintained = dot(maintained_state, DEAD)
121                 cumulative_number_of_repairs = sum(needed_repairs)
122                 cumulative_number_of_new_shares = sum(needed_new_shares)
123                 report.add_sample(t, unmaintained_state, maintained_state,
124                                   P_repaired_last_check_period,
125                                   cumulative_number_of_repairs,
126                                   cumulative_number_of_new_shares,
127                                   P_dead_unmaintained, P_dead_maintained)
128
129         # record one more sample at the end of the run
130         P_dead_unmaintained = dot(unmaintained_state, DEAD)
131         P_dead_maintained = dot(maintained_state, DEAD)
132         cumulative_number_of_repairs = sum(needed_repairs)
133         cumulative_number_of_new_shares = sum(needed_new_shares)
134         report.add_sample(t, unmaintained_state, maintained_state,
135                           P_repaired_last_check_period,
136                           cumulative_number_of_repairs,
137                           cumulative_number_of_new_shares,
138                           P_dead_unmaintained, P_dead_maintained)
139
140         #def yandm(seconds):
141         #    return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
142         #needed_repairs_total = sum(needed_repairs)
143         #needed_new_shares_total = sum(needed_new_shares)
144         #print "at 2y:"
145         #print " unmaintained", unmaintained_state
146         #print " maintained", maintained_state
147         #print " number of repairs", needed_repairs_total
148         #print " new shares generated", needed_new_shares_total
149         #repair_rate_inv = report_span / needed_repairs_total
150         #print "  avg repair rate: once every %s" % yandm(repair_rate_inv)
151         #print "  avg repair download: one share every %s" % yandm(repair_rate_inv/k)
152         #print "  avg repair upload: one share every %s" % yandm(report_span / needed_new_shares_total)
153
154         return report
155
156     def p_in_period(self, avg_lifetime, period):
157         """Given an average lifetime of a disk (using an exponential model),
158         what is the chance that a live disk will survive the next 'period'
159         seconds?"""
160
161         # eg p_in_period(8*YEAR, MONTH) = 98.94%
162         return math.exp(-1.0*period/avg_lifetime)
163
164     def build_decay_matrix(self, N, P):
165         """Return a decay matrix. decay[start_shares][end_shares] is the
166         conditional probability of finishing with end_shares, given that we
167         started with start_shares."""
168         decay_rows = []
169         decay_rows.append( [0.0]*(N+1) )
170         for start_shares in range(1, (N+1)):
171             end_shares = self.build_decay_row(start_shares, P)
172             decay_row = end_shares + [0.0] * (N-start_shares)
173             assert len(decay_row) == (N+1), len(decay_row)
174             decay_rows.append(decay_row)
175
176         decay = matrix(decay_rows)
177         return decay
178
179     def build_decay_row(self, start_shares, P):
180         """Return a decay row 'end_shares'. end_shares[i] is the chance that
181         we finish with i shares, given that we started with start_shares, for
182         all i between 0 and start_shares, inclusive. This implementation
183         assumes that all shares are independent (IID), but a more complex
184         model could incorporate inter-share failure correlations like having
185         two shares on the same server."""
186         end_shares = statistics.binomial_distribution_pmf(start_shares, P)
187         return end_shares
188
189     def build_repair_matrix(self, k, N, R):
190         """Return a repair matrix. repair[start][end]: is the conditional
191         probability of the repairer finishing with 'end' shares, given that
192         it began with 'start' shares (repair if fewer than R shares). The
193         repairer's behavior is deterministic, so all values in this matrix
194         are either 0 or 1. This matrix should be applied *after* the decay
195         matrix."""
196         new_repair_rows = []
197         for start_shares in range(0, N+1):
198             new_repair_row = [0] * (N+1)
199             if start_shares < k:
200                 new_repair_row[start_shares] = 1
201             elif start_shares < R:
202                 new_repair_row[N] = 1
203             else:
204                 new_repair_row[start_shares] = 1
205             new_repair_rows.append(new_repair_row)
206
207         repair = matrix(new_repair_rows)
208         return repair
209
210 class ReliabilityReport:
211     def __init__(self):
212         self.samples = []
213
214     def add_sample(self, when, unmaintained_shareprobs, maintained_shareprobs,
215                    P_repaired_last_check_period,
216                    cumulative_number_of_repairs,
217                    cumulative_number_of_new_shares,
218                    P_dead_unmaintained, P_dead_maintained):
219         """
220         when: the timestamp at the end of the report period
221         unmaintained_shareprobs: a vector of probabilities, element[S]
222                                  is the chance that there are S shares
223                                  left at the end of the report period.
224                                  This tracks what happens if no repair
225                                  is ever done.
226         maintained_shareprobs: same, but for 'maintained' grids, where
227                                check and repair is done at the end
228                                of each check period
229         P_repaired_last_check_period: a float, with the probability
230                                       that a repair was performed
231                                       at the end of the most recent
232                                       check period.
233         cumulative_number_of_repairs: a float, with the average number
234                                       of repairs that will have been
235                                       performed by the end of the
236                                       report period
237         cumulative_number_of_new_shares: a float, with the average number
238                                          of new shares that repair proceses
239                                          generated by the end of the report
240                                          period
241         P_dead_unmaintained: a float, with the chance that the file will
242                              be unrecoverable at the end of the period
243         P_dead_maintained: same, but for maintained grids
244
245         """
246         row = (when, unmaintained_shareprobs, maintained_shareprobs,
247                P_repaired_last_check_period,
248                cumulative_number_of_repairs,
249                cumulative_number_of_new_shares,
250                P_dead_unmaintained, P_dead_maintained)
251         self.samples.append(row)