4 from allmydata.util import statistics
5 from numpy import array, matrix, dot
11 class ReliabilityModel:
12 """Generate a model of system-wide reliability, given several input
15 This runs a simulation in which time is quantized down to 'delta' seconds
16 (default is one month): a smaller delta will result in a more accurate
17 simulation, but will take longer to run. 'report_span' simulated seconds
20 The encoding parameters are provided as 'k' (minimum number of shares
21 needed to recover the file) and 'N' (total number of shares generated).
22 The default parameters are 3-of-10.
24 The first step is to build a probability of individual drive loss during
25 any given delta. This uses a simple exponential model, in which the
26 average drive lifetime is specified by the 'drive_lifetime' parameter
29 The second step is to calculate a 'transition matrix': a table of
30 probabilities that shows, given A shares at the start of the delta, what
31 the chances are of having B shares left at the end of the delta. The
32 current code optimistically assumes all drives are independent. A
33 subclass could override that assumption.
35 An additional 'repair matrix' is created to show what happens when the
36 Checker/Repairer is run. In the simulation, the Checker will be run every
37 'check_period' seconds (default is one month), and the Repairer will be
38 run if it sees fewer than 'R' shares (default 7).
40 The third step is to finally run the simulation. An initial probability
41 vector is created (with a 100% chance of N shares and a 0% chance of
42 fewer than N shares), then it is multiplied by the transition matrix for
43 every delta of time. Each time the Checker is to be run, the repair
44 matrix is multiplied in, and some additional stats are accumulated
45 (average number of repairs that occur, average number of shares
46 regenerated per repair).
48 The output is a ReliabilityReport instance, which contains a table that
49 samples the state of the simulation once each 'report_period' seconds
50 (defaults to 3 months). Each row of this table will contain the
51 probability vector for one sample period (chance of having X shares, from
52 0 to N, at the end of the period). The report will also contain other
59 drive_lifetime=8*YEAR,
63 report_period=3*MONTH,
68 check_period = check_period-1
69 P = self.p_in_period(drive_lifetime, delta)
71 decay = self.build_decay_matrix(N, P)
73 repair = self.build_repair_matrix(k, N, R)
75 #print "DECAY:", decay
76 #print "OLD-POST-REPAIR:", old_post_repair
77 #print "NEW-POST-REPAIR:", decay * repair
78 #print "REPAIR:", repair
79 #print "DIFF:", (old_post_repair - decay * repair)
81 START = array([0]*N + [1])
82 DEAD = array([1]*k + [0]*(1+N-k))
83 REPAIRp = array([0]*k + [1]*(R-k) + [0]*(1+N-R))
84 REPAIR_newshares = array([0]*k +
85 [N-i for i in range(k, R)] +
87 assert REPAIR_newshares.shape[0] == N+1
89 #print "REPAIRp", REPAIRp
90 #print "REPAIR_newshares", REPAIR_newshares
92 unmaintained_state = START
93 maintained_state = START
96 P_repaired_last_check_period = 0.0
98 needed_new_shares = []
99 report = ReliabilityReport()
101 for t in range(0, report_span+delta, delta):
102 # the .A[0] turns the one-row matrix back into an array
103 unmaintained_state = (unmaintained_state * decay).A[0]
104 maintained_state = (maintained_state * decay).A[0]
105 if (t-last_check) > check_period:
107 # we do a check-and-repair this frequently
108 need_repair = dot(maintained_state, REPAIRp)
110 P_repaired_last_check_period = need_repair
111 new_shares = dot(maintained_state, REPAIR_newshares)
112 needed_repairs.append(need_repair)
113 needed_new_shares.append(new_shares)
115 maintained_state = (maintained_state * repair).A[0]
117 if (t-last_report) > report_period:
119 P_dead_unmaintained = dot(unmaintained_state, DEAD)
120 P_dead_maintained = dot(maintained_state, DEAD)
121 cumulative_number_of_repairs = sum(needed_repairs)
122 cumulative_number_of_new_shares = sum(needed_new_shares)
123 report.add_sample(t, unmaintained_state, maintained_state,
124 P_repaired_last_check_period,
125 cumulative_number_of_repairs,
126 cumulative_number_of_new_shares,
127 P_dead_unmaintained, P_dead_maintained)
129 # record one more sample at the end of the run
130 P_dead_unmaintained = dot(unmaintained_state, DEAD)
131 P_dead_maintained = dot(maintained_state, DEAD)
132 cumulative_number_of_repairs = sum(needed_repairs)
133 cumulative_number_of_new_shares = sum(needed_new_shares)
134 report.add_sample(t, unmaintained_state, maintained_state,
135 P_repaired_last_check_period,
136 cumulative_number_of_repairs,
137 cumulative_number_of_new_shares,
138 P_dead_unmaintained, P_dead_maintained)
141 # return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
142 #needed_repairs_total = sum(needed_repairs)
143 #needed_new_shares_total = sum(needed_new_shares)
145 #print " unmaintained", unmaintained_state
146 #print " maintained", maintained_state
147 #print " number of repairs", needed_repairs_total
148 #print " new shares generated", needed_new_shares_total
149 #repair_rate_inv = report_span / needed_repairs_total
150 #print " avg repair rate: once every %s" % yandm(repair_rate_inv)
151 #print " avg repair download: one share every %s" % yandm(repair_rate_inv/k)
152 #print " avg repair upload: one share every %s" % yandm(report_span / needed_new_shares_total)
156 def p_in_period(self, avg_lifetime, period):
157 """Given an average lifetime of a disk (using an exponential model),
158 what is the chance that a live disk will survive the next 'period'
161 # eg p_in_period(8*YEAR, MONTH) = 98.94%
162 return math.exp(-1.0*period/avg_lifetime)
164 def build_decay_matrix(self, N, P):
165 """Return a decay matrix. decay[start_shares][end_shares] is the
166 conditional probability of finishing with end_shares, given that we
167 started with start_shares."""
169 decay_rows.append( [0.0]*(N+1) )
170 for start_shares in range(1, (N+1)):
171 end_shares = self.build_decay_row(start_shares, P)
172 decay_row = end_shares + [0.0] * (N-start_shares)
173 assert len(decay_row) == (N+1), len(decay_row)
174 decay_rows.append(decay_row)
176 decay = matrix(decay_rows)
179 def build_decay_row(self, start_shares, P):
180 """Return a decay row 'end_shares'. end_shares[i] is the chance that
181 we finish with i shares, given that we started with start_shares, for
182 all i between 0 and start_shares, inclusive. This implementation
183 assumes that all shares are independent (IID), but a more complex
184 model could incorporate inter-share failure correlations like having
185 two shares on the same server."""
186 end_shares = statistics.binomial_distribution_pmf(start_shares, P)
189 def build_repair_matrix(self, k, N, R):
190 """Return a repair matrix. repair[start][end]: is the conditional
191 probability of the repairer finishing with 'end' shares, given that
192 it began with 'start' shares (repair if fewer than R shares). The
193 repairer's behavior is deterministic, so all values in this matrix
194 are either 0 or 1. This matrix should be applied *after* the decay
197 for start_shares in range(0, N+1):
198 new_repair_row = [0] * (N+1)
200 new_repair_row[start_shares] = 1
201 elif start_shares < R:
202 new_repair_row[N] = 1
204 new_repair_row[start_shares] = 1
205 new_repair_rows.append(new_repair_row)
207 repair = matrix(new_repair_rows)
210 class ReliabilityReport:
214 def add_sample(self, when, unmaintained_shareprobs, maintained_shareprobs,
215 P_repaired_last_check_period,
216 cumulative_number_of_repairs,
217 cumulative_number_of_new_shares,
218 P_dead_unmaintained, P_dead_maintained):
220 when: the timestamp at the end of the report period
221 unmaintained_shareprobs: a vector of probabilities, element[S]
222 is the chance that there are S shares
223 left at the end of the report period.
224 This tracks what happens if no repair
226 maintained_shareprobs: same, but for 'maintained' grids, where
227 check and repair is done at the end
229 P_repaired_last_check_period: a float, with the probability
230 that a repair was performed
231 at the end of the most recent
233 cumulative_number_of_repairs: a float, with the average number
234 of repairs that will have been
235 performed by the end of the
237 cumulative_number_of_new_shares: a float, with the average number
238 of new shares that repair proceses
239 generated by the end of the report
241 P_dead_unmaintained: a float, with the chance that the file will
242 be unrecoverable at the end of the period
243 P_dead_maintained: same, but for maintained grids
246 row = (when, unmaintained_shareprobs, maintained_shareprobs,
247 P_repaired_last_check_period,
248 cumulative_number_of_repairs,
249 cumulative_number_of_new_shares,
250 P_dead_unmaintained, P_dead_maintained)
251 self.samples.append(row)