4 from allmydata.util import statistics
6 from Numeric import array, matrixmultiply as mm
# (interior of my_dot(v1, v2) -- the "def" line is outside this fragment)
# Dot product of two 1xN row vectors: elementwise-multiply, then sum twice
# to reduce the 2-D (1xN) product down to a scalar.
# The commented-out asserts below document the expected shapes: both
# arguments are rank-2 arrays with a single row of equal length.
#print v1.shape, v2.shape
#assert len(v1.shape) == 2
#assert v1.shape[0] == 1
#assert len(v2.shape) == 2
#assert v2.shape[0] == 1
#assert v1.shape[1] == v2.shape[1]
#for i in range(v1.shape[1]):
return Numeric.sum(Numeric.sum(v1*v2))
23 return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
class ReliabilityModel:
    """Generate a model of system-wide reliability, given several input
    This runs a simulation in which time is quantized down to 'delta' seconds
    (default is one month): a smaller delta will result in a more accurate
    simulation, but will take longer to run. 'report_span' simulated seconds
    The encoding parameters are provided as 'k' (minimum number of shares
    needed to recover the file) and 'N' (total number of shares generated).
    The default parameters are 3-of-10.
    The first step is to build a probability of individual drive loss during
    any given delta. This uses a simple exponential model, in which the
    average drive lifetime is specified by the 'drive_lifetime' parameter
    The second step is to calculate a 'transition matrix': a table of
    probabilities that shows, given A shares at the start of the delta, what
    the chances are of having B shares left at the end of the delta. The
    current code optimistically assumes all drives are independent. A
    subclass could override that assumption.
    An additional 'repair matrix' is created to show what happens when the
    Checker/Repairer is run. In the simulation, the Checker will be run every
    'check_period' seconds (default is one month), and the Repairer will be
    run if it sees fewer than 'R' shares (default 7).
    The third step is to finally run the simulation. An initial probability
    vector is created (with a 100% chance of N shares and a 0% chance of
    fewer than N shares), then it is multiplied by the transition matrix for
    every delta of time. Each time the Checker is to be run, the repair
    matrix is multiplied in, and some additional stats are accumulated
    (average number of repairs that occur, average number of shares
    regenerated per repair).
    The output is a ReliabilityReport instance, which contains a table that
    samples the state of the simulation once each 'report_period' seconds
    (defaults to 3 months). Each row of this table will contain the
    probability vector for one sample period (chance of having X shares, from
    0 to N, at the end of the period). The report will also contain other
    # NOTE(review): the "def run(...)" line is not visible in this fragment;
    # the next two lines are two of its keyword-argument defaults.
    drive_lifetime=8*YEAR,
    report_period=3*MONTH,
        # NOTE(review): subtracting one second presumably makes the
        # '(t-last_check) > check_period' test below fire when exactly
        # check_period seconds have elapsed -- confirm intent.
        check_period = check_period-1
        # P: chance a single drive survives one 'delta' of simulated time
        P = self.p_in_period(drive_lifetime, delta)
        decay = self.build_decay_matrix(N, P)
        repair = self.build_repair_matrix(k, N, R)
        #print "DECAY:", decay
        #print "OLD-POST-REPAIR:", old_post_repair
        #print "NEW-POST-REPAIR:", mm(decay, repair)
        #print "REPAIR:", repair
        #print "DIFF:", (old_post_repair - mm(decay, repair))
        # Row vectors indexed by "number of surviving shares" (0..N):
        #   START: all probability mass on the N-shares state.
        #   ALIVE/DEAD: indicators for recoverable (>=k) vs lost (<k) states.
        #   REPAIRp: indicator for states that trigger a repair (k <= s < R).
        #   REPAIR_newshares[s]: shares a repair creates from state s.
        START = array([[0]*N + [1]])
        ALIVE = array([[0]*k + [1]*(1+N-k)])
        DEAD = array([[1]*k + [0]*(1+N-k)])
        REPAIRp = array([[0]*k + [1]*(R-k) + [0]*(1+N-R)])
        REPAIR_newshares = array([[0]*k +
                                  [N-i for i in range(k, R)] +
        assert REPAIR_newshares.shape[1] == N+1
        #print "START", START
        #print "ALIVE", ALIVE
        #print "REPAIRp", REPAIRp
        #print "REPAIR_newshares", REPAIR_newshares
        # two parallel simulations: one grid that is never repaired, and
        # one that gets periodic check-and-repair
        unmaintained_state = START
        maintained_state = START
        P_repaired_last_check_period = 0.0
        needed_new_shares = []
        report = ReliabilityReport()
        # step the simulation forward one 'delta' of seconds at a time
        for t in range(0, report_span+delta, delta):
            unmaintained_state = mm(unmaintained_state, decay)
            maintained_state = mm(maintained_state, decay)
            # time to run the Checker/Repairer on the maintained grid?
            if (t-last_check) > check_period:
                # we do a check-and-repair this frequently
                need_repair = my_dot(maintained_state, REPAIRp)
                P_repaired_last_check_period = need_repair
                new_shares = my_dot(maintained_state, REPAIR_newshares)
                needed_repairs.append(need_repair)
                needed_new_shares.append(new_shares)
                maintained_state = mm(maintained_state, repair)
            # time to add a sample row to the report?
            if (t-last_report) > report_period:
                P_dead_unmaintained = my_dot(unmaintained_state, DEAD)
                P_dead_maintained = my_dot(maintained_state, DEAD)
                cumulative_number_of_repairs = sum(needed_repairs)
                cumulative_number_of_new_shares = sum(needed_new_shares)
                report.add_sample(t, unmaintained_state, maintained_state,
                                  P_repaired_last_check_period,
                                  cumulative_number_of_repairs,
                                  cumulative_number_of_new_shares,
                                  P_dead_unmaintained, P_dead_maintained)

        # record one more sample at the end of the run
        P_dead_unmaintained = my_dot(unmaintained_state, DEAD)
        P_dead_maintained = my_dot(maintained_state, DEAD)
        cumulative_number_of_repairs = sum(needed_repairs)
        cumulative_number_of_new_shares = sum(needed_new_shares)
        report.add_sample(t, unmaintained_state, maintained_state,
                          P_repaired_last_check_period,
                          cumulative_number_of_repairs,
                          cumulative_number_of_new_shares,
                          P_dead_unmaintained, P_dead_maintained)
        #needed_repairs_total = sum(needed_repairs)
        #needed_new_shares_total = sum(needed_new_shares)
        #print " unmaintained", unmaintained_state
        #print " maintained", maintained_state
        #print " number of repairs", needed_repairs_total
        #print " new shares generated", needed_new_shares_total
        #repair_rate_inv = report_span / needed_repairs_total
        #print " avg repair rate: once every %s" % yandm(repair_rate_inv)
        #print " avg repair download: one share every %s" % yandm(repair_rate_inv/k)
        #print " avg repair upload: one share every %s" % yandm(report_span / needed_new_shares_total)
def p_in_period(self, avg_lifetime, period):
    """Given an average lifetime of a disk (using an exponential model),
    what is the chance that a live disk will survive the next 'period'
    seconds?"""
    # exponential survival: exp(-t/tau); the 1.0 factor forces float
    # division even under old integer-division semantics
    exponent = -1.0 * period / avg_lifetime
    return math.exp(exponent)
def build_decay_matrix(self, N, P):
    """Return a decay matrix. decay[start_shares][end_shares] is the
    conditional probability of finishing with end_shares, given that we
    started with start_shares.

    N: total number of shares; the matrix is (N+1)x(N+1).
    P: chance that a single share survives one delta (see p_in_period).
    """
    # FIX(review): the visible fragment appended to decay_rows without
    # initializing it and never returned the matrix; both added here.
    decay_rows = []
    # the start_shares=0 row is all zeros in this model -- presumably the
    # zero-shares state contributes no mass to later states; confirm
    decay_rows.append( [0.0]*(N+1) )
    for start_shares in range(1, (N+1)):
        # chance of ending with 0..start_shares shares, padded out to N+1
        end_shares = self.build_decay_row(start_shares, P)
        decay_row = end_shares + [0.0] * (N-start_shares)
        assert len(decay_row) == (N+1), len(decay_row)
        decay_rows.append(decay_row)

    decay = array(decay_rows)
    return decay
def build_decay_row(self, start_shares, P):
    """Return a decay row 'end_shares'. end_shares[i] is the chance that
    we finish with i shares, given that we started with start_shares, for
    all i between 0 and start_shares, inclusive. This implementation
    assumes that all shares are independent (IID), but a more complex
    model could incorporate inter-share failure correlations like having
    two shares on the same server."""
    # each share independently survives one delta with probability P, so
    # the number of survivors is binomially distributed
    end_shares = statistics.binomial_distribution_pmf(start_shares, P)
    # FIX(review): the visible fragment computed the row but never
    # returned it, although the docstring promises a return value
    return end_shares
def build_repair_matrix(self, k, N, R):
    """Return a repair matrix. repair[start][end]: is the conditional
    probability of the repairer finishing with 'end' shares, given that
    it began with 'start' shares (repair if fewer than R shares). The
    repairer's behavior is deterministic, so all values in this matrix
    are either 0 or 1. This matrix should be applied *after* the decay
    matrix.

    FIX(review): the visible fragment was missing the list
    initialization, the 'if start_shares < k:' guard (implied by the
    'elif' that followed), the 'else:' branch, and the return statement;
    all reconstructed here per the docstring's deterministic contract.
    """
    new_repair_rows = []
    for start_shares in range(0, N+1):
        new_repair_row = [0] * (N+1)
        if start_shares < k:
            # fewer than k shares: file is unrecoverable, repair is
            # impossible, so the state is unchanged
            new_repair_row[start_shares] = 1
        elif start_shares < R:
            # recoverable but below the repair threshold: the repairer
            # regenerates the file back up to all N shares
            new_repair_row[N] = 1
        else:
            # at or above the threshold R: the repairer leaves it alone
            new_repair_row[start_shares] = 1
        new_repair_rows.append(new_repair_row)

    repair = array(new_repair_rows)
    return repair
class ReliabilityReport:
    """Accumulates one sample per report period from the reliability
    simulation; see add_sample for the contents of each sample."""

    def __init__(self):
        # FIX(review): add_sample appends to self.samples, but no visible
        # code initialized it; list of sample tuples in insertion order
        self.samples = []

    def add_sample(self, when, unmaintained_shareprobs, maintained_shareprobs,
                   P_repaired_last_check_period,
                   cumulative_number_of_repairs,
                   cumulative_number_of_new_shares,
                   P_dead_unmaintained, P_dead_maintained):
        """Record one sample of the simulation state.

        when: the timestamp at the end of the report period
        unmaintained_shareprobs: a vector of probabilities, element[S]
                                 is the chance that there are S shares
                                 left at the end of the report period.
                                 This tracks what happens if no repair
                                 is ever performed.
        maintained_shareprobs: same, but for 'maintained' grids, where
                               check and repair is done at the end
                               of each check period
        P_repaired_last_check_period: a float, with the probability
                                      that a repair was performed
                                      at the end of the most recent
                                      check period
        cumulative_number_of_repairs: a float, with the average number
                                      of repairs that will have been
                                      performed by the end of the
                                      report period
        cumulative_number_of_new_shares: a float, with the average number
                                         of new shares that repair processes
                                         generated by the end of the report
                                         period
        P_dead_unmaintained: a float, with the chance that the file will
                             be unrecoverable at the end of the period
        P_dead_maintained: same, but for maintained grids
        """
        row = (when, unmaintained_shareprobs, maintained_shareprobs,
               P_repaired_last_check_period,
               cumulative_number_of_repairs,
               cumulative_number_of_new_shares,
               P_dead_unmaintained, P_dead_maintained)
        self.samples.append(row)