]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - misc/operations_helpers/provisioning/provisioning.py
Add nickname/nodeid to storage-status web page. Closes #1204.
[tahoe-lafs/tahoe-lafs.git] / misc / operations_helpers / provisioning / provisioning.py
1
2 from nevow import inevow, rend, loaders, tags as T
3 import math
4 import util
5
6 # factorial and binomial copied from
7 # http://mail.python.org/pipermail/python-list/2007-April/435718.html
8
def div_ceil(n, d):
    """
    The smallest integer k such that k*d >= n.

    Uses floor division (``//``) plus a remainder test: with plain ``/``
    this silently returns a float under Python 3 (or with
    ``from __future__ import division``), while ``//`` keeps the result
    an integer and is identical to the old Python-2 behavior for ints.
    """
    return (n // d) + (n % d != 0)
14
def factorial(n):
    """factorial(n): return the factorial of the non-negative integer n.
    factorial(0) = 1

    Raises AssertionError for n < 0.  (The docstring copied from the
    mailing-list original claimed -factorial(abs(n)) for negative n, but
    the code has always rejected negatives via the assert — the claim
    was never true, so it is dropped here.)
    """
    # Validate the input before doing any work; the original asserted
    # only after the loop had already run.
    assert n >= 0
    result = 1
    for i in range(2, n + 1):
        result *= i
    return result
25
def binomial(n, k):
    """Return C(n, k), the number of k-element subsets of an n-element set.

    Requires 0 <= k <= n.  Computes n!/(k!*(n-k)!) as the single product
    n*(n-1)*...*(k+1), then divides by (n-k)!, avoiding the factors that
    would just get canceled.
    """
    assert 0 <= k <= n
    if k == 0 or k == n:
        return 1
    # P accumulates n!/k! = (k+1)*(k+2)*...*n
    P = k + 1
    for i in range(k + 2, n + 1):
        P *= i
    # math.factorial is the C-implemented stdlib routine; // keeps the
    # exact-integer division (the quotient is always exact here).
    return P // math.factorial(n - k)
40
41 class ProvisioningTool(rend.Page):
42     addSlash = True
43     docFactory = loaders.xmlfile(util.sibling("provisioning.xhtml"))
44
45     def render_forms(self, ctx, data):
46         req = inevow.IRequest(ctx)
47
48         def getarg(name, astype=int):
49             if req.method != "POST":
50                 return None
51             if name in req.fields:
52                 return astype(req.fields[name].value)
53             return None
54         return self.do_forms(getarg)
55
56
57     def do_forms(self, getarg):
58         filled = getarg("filled", bool)
59
60         def get_and_set(name, options, default=None, astype=int):
61             current_value = getarg(name, astype)
62             i_select = T.select(name=name)
63             for (count, description) in options:
64                 count = astype(count)
65                 if ((current_value is not None and count == current_value) or
66                     (current_value is None and count == default)):
67                     o = T.option(value=str(count), selected="true")[description]
68                 else:
69                     o = T.option(value=str(count))[description]
70                 i_select = i_select[o]
71             if current_value is None:
72                 current_value = default
73             return current_value, i_select
74
75         sections = {}
76         def add_input(section, text, entry):
77             if section not in sections:
78                 sections[section] = []
79             sections[section].extend([T.div[text, ": ", entry], "\n"])
80
81         def add_output(section, entry):
82             if section not in sections:
83                 sections[section] = []
84             sections[section].extend([entry, "\n"])
85
86         def build_section(section):
87             return T.fieldset[T.legend[section], sections[section]]
88
89         def number(value, suffix=""):
90             scaling = 1
91             if value < 1:
92                 fmt = "%1.2g%s"
93             elif value < 100:
94                 fmt = "%.1f%s"
95             elif value < 1000:
96                 fmt = "%d%s"
97             elif value < 1e6:
98                 fmt = "%.2fk%s"; scaling = 1e3
99             elif value < 1e9:
100                 fmt = "%.2fM%s"; scaling = 1e6
101             elif value < 1e12:
102                 fmt = "%.2fG%s"; scaling = 1e9
103             elif value < 1e15:
104                 fmt = "%.2fT%s"; scaling = 1e12
105             elif value < 1e18:
106                 fmt = "%.2fP%s"; scaling = 1e15
107             else:
108                 fmt = "huge! %g%s"
109             return fmt % (value / scaling, suffix)
110
111         user_counts = [(5, "5 users"),
112                        (50, "50 users"),
113                        (200, "200 users"),
114                        (1000, "1k users"),
115                        (10000, "10k users"),
116                        (50000, "50k users"),
117                        (100000, "100k users"),
118                        (500000, "500k users"),
119                        (1000000, "1M users"),
120                        ]
121         num_users, i_num_users = get_and_set("num_users", user_counts, 50000)
122         add_input("Users",
123                   "How many users are on this network?", i_num_users)
124
125         files_per_user_counts = [(100, "100 files"),
126                                  (1000, "1k files"),
127                                  (10000, "10k files"),
128                                  (100000, "100k files"),
129                                  (1e6, "1M files"),
130                                  ]
131         files_per_user, i_files_per_user = get_and_set("files_per_user",
132                                                        files_per_user_counts,
133                                                        1000)
134         add_input("Users",
135                   "How many files for each user? (avg)",
136                   i_files_per_user)
137
138         space_per_user_sizes = [(1e6, "1MB"),
139                                 (10e6, "10MB"),
140                                 (100e6, "100MB"),
141                                 (200e6, "200MB"),
142                                 (1e9, "1GB"),
143                                 (2e9, "2GB"),
144                                 (5e9, "5GB"),
145                                 (10e9, "10GB"),
146                                 (100e9, "100GB"),
147                                 (1e12, "1TB"),
148                                 (2e12, "2TB"),
149                                 (5e12, "5TB"),
150                                 ]
151         # Estimate ~5gb per user as a more realistic case
152         space_per_user, i_space_per_user = get_and_set("space_per_user",
153                                                        space_per_user_sizes,
154                                                        5e9)
155         add_input("Users",
156                   "How much data for each user? (avg)",
157                   i_space_per_user)
158
159         sharing_ratios = [(1.0, "1.0x"),
160                           (1.1, "1.1x"),
161                           (2.0, "2.0x"),
162                           ]
163         sharing_ratio, i_sharing_ratio = get_and_set("sharing_ratio",
164                                                      sharing_ratios, 1.0,
165                                                      float)
166         add_input("Users",
167                   "What is the sharing ratio? (1.0x is no-sharing and"
168                   " no convergence)", i_sharing_ratio)
169
170         # Encoding parameters
171         encoding_choices = [("3-of-10-5", "3.3x (3-of-10, repair below 5)"),
172                             ("3-of-10-8", "3.3x (3-of-10, repair below 8)"),
173                             ("5-of-10-7", "2x (5-of-10, repair below 7)"),
174                             ("8-of-10-9", "1.25x (8-of-10, repair below 9)"),
175                             ("27-of-30-28", "1.1x (27-of-30, repair below 28"),
176                             ("25-of-100-50", "4x (25-of-100, repair below 50)"),
177                             ]
178         encoding_parameters, i_encoding_parameters = \
179                              get_and_set("encoding_parameters",
180                                          encoding_choices, "3-of-10-5", str)
181         encoding_pieces = encoding_parameters.split("-")
182         k = int(encoding_pieces[0])
183         assert encoding_pieces[1] == "of"
184         n = int(encoding_pieces[2])
185         # we repair the file when the number of available shares drops below
186         # this value
187         repair_threshold = int(encoding_pieces[3])
188
189         add_input("Servers",
190                   "What are the default encoding parameters?",
191                   i_encoding_parameters)
192
193         # Server info
194         num_server_choices = [ (5, "5 servers"),
195                                (10, "10 servers"),
196                                (15, "15 servers"),
197                                (30, "30 servers"),
198                                (50, "50 servers"),
199                                (100, "100 servers"),
200                                (200, "200 servers"),
201                                (300, "300 servers"),
202                                (500, "500 servers"),
203                                (1000, "1k servers"),
204                                (2000, "2k servers"),
205                                (5000, "5k servers"),
206                                (10e3, "10k servers"),
207                                (100e3, "100k servers"),
208                                (1e6, "1M servers"),
209                                ]
210         num_servers, i_num_servers = \
211                      get_and_set("num_servers", num_server_choices, 30, int)
212         add_input("Servers",
213                   "How many servers are there?", i_num_servers)
214
215         # availability is measured in dBA = -dBF, where 0dBF is 100% failure,
216         # 10dBF is 10% failure, 20dBF is 1% failure, etc
217         server_dBA_choices = [ (10, "90% [10dBA] (2.4hr/day)"),
218                                (13, "95% [13dBA] (1.2hr/day)"),
219                                (20, "99% [20dBA] (14min/day or 3.5days/year)"),
220                                (23, "99.5% [23dBA] (7min/day or 1.75days/year)"),
221                                (30, "99.9% [30dBA] (87sec/day or 9hours/year)"),
222                                (40, "99.99% [40dBA] (60sec/week or 53min/year)"),
223                                (50, "99.999% [50dBA] (5min per year)"),
224                                ]
225         server_dBA, i_server_availability = \
226                     get_and_set("server_availability",
227                                 server_dBA_choices,
228                                 20, int)
229         add_input("Servers",
230                   "What is the server availability?", i_server_availability)
231
232         drive_MTBF_choices = [ (40, "40,000 Hours"),
233                                ]
234         drive_MTBF, i_drive_MTBF = \
235                     get_and_set("drive_MTBF", drive_MTBF_choices, 40, int)
236         add_input("Drives",
237                   "What is the hard drive MTBF?", i_drive_MTBF)
238         # http://www.tgdaily.com/content/view/30990/113/
239         # http://labs.google.com/papers/disk_failures.pdf
240         # google sees:
241         #  1.7% of the drives they replaced were 0-1 years old
242 #  8% of the drives they replaced were 1-2 years old
243         #  8.6% were 2-3 years old
244         #  6% were 3-4 years old, about 8% were 4-5 years old
245
246         drive_size_choices = [ (100, "100 GB"),
247                                (250, "250 GB"),
248                                (500, "500 GB"),
249                                (750, "750 GB"),
250                                (1000, "1000 GB"),
251                                (2000, "2000 GB"),
252                                (3000, "3000 GB"),
253                                ]
254         drive_size, i_drive_size = \
255                     get_and_set("drive_size", drive_size_choices, 3000, int)
256         drive_size = drive_size * 1e9
257         add_input("Drives",
258                   "What is the capacity of each hard drive?", i_drive_size)
259         drive_failure_model_choices = [ ("E", "Exponential"),
260                                         ("U", "Uniform"),
261                                         ]
262         drive_failure_model, i_drive_failure_model = \
263                              get_and_set("drive_failure_model",
264                                          drive_failure_model_choices,
265                                          "E", str)
266         add_input("Drives",
267                   "How should we model drive failures?", i_drive_failure_model)
268
269         # drive_failure_rate is in failures per second
270         if drive_failure_model == "E":
271             drive_failure_rate = 1.0 / (drive_MTBF * 1000 * 3600)
272         else:
273             drive_failure_rate = 0.5 / (drive_MTBF * 1000 * 3600)
274
275         # deletion/gc/ownership mode
276         ownership_choices = [ ("A", "no deletion, no gc, no owners"),
277                               ("B", "deletion, no gc, no owners"),
278                               ("C", "deletion, share timers, no owners"),
279                               ("D", "deletion, no gc, yes owners"),
280                               ("E", "deletion, owner timers"),
281                               ]
282         ownership_mode, i_ownership_mode = \
283                         get_and_set("ownership_mode", ownership_choices,
284                                     "A", str)
285         add_input("Servers",
286                   "What is the ownership mode?", i_ownership_mode)
287
288         # client access behavior
289         access_rates = [ (1, "one file per day"),
290                          (10, "10 files per day"),
291                          (100, "100 files per day"),
292                          (1000, "1k files per day"),
293                          (10e3, "10k files per day"),
294                          (100e3, "100k files per day"),
295                          ]
296         download_files_per_day, i_download_rate = \
297                                 get_and_set("download_rate", access_rates,
298                                             100, int)
299         add_input("Users",
300                   "How many files are downloaded per day?", i_download_rate)
301         download_rate = 1.0 * download_files_per_day / (24*60*60)
302
303         upload_files_per_day, i_upload_rate = \
304                               get_and_set("upload_rate", access_rates,
305                                           10, int)
306         add_input("Users",
307                   "How many files are uploaded per day?", i_upload_rate)
308         upload_rate = 1.0 * upload_files_per_day / (24*60*60)
309
310         delete_files_per_day, i_delete_rate = \
311                               get_and_set("delete_rate", access_rates,
312                                           10, int)
313         add_input("Users",
314                   "How many files are deleted per day?", i_delete_rate)
315         delete_rate = 1.0 * delete_files_per_day / (24*60*60)
316
317
318         # the value is in days
319         lease_timers = [ (1, "one refresh per day"),
320                          (7, "one refresh per week"),
321                          ]
322         lease_timer, i_lease = \
323                      get_and_set("lease_timer", lease_timers,
324                                  7, int)
325         add_input("Users",
326                   "How frequently do clients refresh files or accounts? "
327                   "(if necessary)",
328                   i_lease)
329         seconds_per_lease = 24*60*60*lease_timer
330
331         check_timer_choices = [ (1, "every week"),
332                                 (4, "every month"),
333                                 (8, "every two months"),
334                                 (16, "every four months"),
335                                 ]
336         check_timer, i_check_timer = \
337                      get_and_set("check_timer", check_timer_choices, 4, int)
338         add_input("Users",
339                   "How frequently should we check on each file?",
340                   i_check_timer)
341         file_check_interval = check_timer * 7 * 24 * 3600
342
343
344         if filled:
345             add_output("Users", T.div["Total users: %s" % number(num_users)])
346             add_output("Users",
347                        T.div["Files per user: %s" % number(files_per_user)])
348             file_size = 1.0 * space_per_user / files_per_user
349             add_output("Users",
350                        T.div["Average file size: ", number(file_size)])
351             total_files = num_users * files_per_user / sharing_ratio
352
353             add_output("Grid",
354                        T.div["Total number of files in grid: ",
355                              number(total_files)])
356             total_space = num_users * space_per_user / sharing_ratio
357             add_output("Grid",
358                        T.div["Total volume of plaintext in grid: ",
359                              number(total_space, "B")])
360
361             total_shares = n * total_files
362             add_output("Grid",
363                        T.div["Total shares in grid: ", number(total_shares)])
364             expansion = float(n) / float(k)
365
366             total_usage = expansion * total_space
367             add_output("Grid",
368                        T.div["Share data in grid: ", number(total_usage, "B")])
369
370             if n > num_servers:
371                 # silly configuration, causes Tahoe2 to wrap and put multiple
372                 # shares on some servers.
373                 add_output("Servers",
374                            T.div["non-ideal: more shares than servers"
375                                  " (n=%d, servers=%d)" % (n, num_servers)])
376                 # every file has at least one share on every server
377                 buckets_per_server = total_files
378                 shares_per_server = total_files * ((1.0 * n) / num_servers)
379             else:
380                 # if nobody is full, then no lease requests will be turned
381                 # down for lack of space, and no two shares for the same file
382                 # will share a server. Therefore the chance that any given
383                 # file has a share on any given server is n/num_servers.
384                 buckets_per_server = total_files * ((1.0 * n) / num_servers)
385                 # since each such represented file only puts one share on a
386                 # server, the total number of shares per server is the same.
387                 shares_per_server = buckets_per_server
388             add_output("Servers",
389                        T.div["Buckets per server: ",
390                              number(buckets_per_server)])
391             add_output("Servers",
392                        T.div["Shares per server: ",
393                              number(shares_per_server)])
394
395             # how much space is used on the storage servers for the shares?
396             #  the share data itself
397             share_data_per_server = total_usage / num_servers
398             add_output("Servers",
399                        T.div["Share data per server: ",
400                              number(share_data_per_server, "B")])
401             # this is determined empirically. H=hashsize=32, for a one-segment
402             # file and 3-of-10 encoding
403             share_validation_per_server = 266 * shares_per_server
404             # this could be 423*buckets_per_server, if we moved the URI
405             # extension into a separate file, but that would actually consume
406             # *more* space (minimum filesize is 4KiB), unless we moved all
407             # shares for a given bucket into a single file.
408             share_uri_extension_per_server = 423 * shares_per_server
409
410             # ownership mode adds per-bucket data
411             H = 32 # depends upon the desired security of delete/refresh caps
412             # bucket_lease_size is the amount of data needed to keep track of
413             # the delete/refresh caps for each bucket.
414             bucket_lease_size = 0
415             client_bucket_refresh_rate = 0
416             owner_table_size = 0
417             if ownership_mode in ("B", "C", "D", "E"):
418                 bucket_lease_size = sharing_ratio * 1.0 * H
419             if ownership_mode in ("B", "C"):
420                 # refreshes per second per client
421                 client_bucket_refresh_rate = (1.0 * n * files_per_user /
422                                               seconds_per_lease)
423                 add_output("Users",
424                            T.div["Client share refresh rate (outbound): ",
425                                  number(client_bucket_refresh_rate, "Hz")])
426                 server_bucket_refresh_rate = (client_bucket_refresh_rate *
427                                               num_users / num_servers)
428                 add_output("Servers",
429                            T.div["Server share refresh rate (inbound): ",
430                                  number(server_bucket_refresh_rate, "Hz")])
431             if ownership_mode in ("D", "E"):
432                 # each server must maintain a bidirectional mapping from
433                 # buckets to owners. One way to implement this would be to
434                 # put a list of four-byte owner numbers into each bucket, and
435                 # a list of four-byte share numbers into each owner (although
436                 # of course we'd really just throw it into a database and let
437                 # the experts take care of the details).
438                 owner_table_size = 2*(buckets_per_server * sharing_ratio * 4)
439
440             if ownership_mode in ("E",):
441                 # in this mode, clients must refresh one timer per server
442                 client_account_refresh_rate = (1.0 * num_servers /
443                                                seconds_per_lease)
444                 add_output("Users",
445                            T.div["Client account refresh rate (outbound): ",
446                                  number(client_account_refresh_rate, "Hz")])
447                 server_account_refresh_rate = (client_account_refresh_rate *
448                                               num_users / num_servers)
449                 add_output("Servers",
450                            T.div["Server account refresh rate (inbound): ",
451                                  number(server_account_refresh_rate, "Hz")])
452
453             # TODO: buckets vs shares here is a bit wonky, but in
454             # non-wrapping grids it shouldn't matter
455             share_lease_per_server = bucket_lease_size * buckets_per_server
456             share_ownertable_per_server = owner_table_size
457
458             share_space_per_server = (share_data_per_server +
459                                       share_validation_per_server +
460                                       share_uri_extension_per_server +
461                                       share_lease_per_server +
462                                       share_ownertable_per_server)
463             add_output("Servers",
464                        T.div["Share space per server: ",
465                              number(share_space_per_server, "B"),
466                              " (data ",
467                              number(share_data_per_server, "B"),
468                              ", validation ",
469                              number(share_validation_per_server, "B"),
470                              ", UEB ",
471                              number(share_uri_extension_per_server, "B"),
472                              ", lease ",
473                              number(share_lease_per_server, "B"),
474                              ", ownertable ",
475                              number(share_ownertable_per_server, "B"),
476                              ")",
477                              ])
478
479
480             # rates
481             client_download_share_rate = download_rate * k
482             client_download_byte_rate = download_rate * file_size
483             add_output("Users",
484                        T.div["download rate: shares = ",
485                              number(client_download_share_rate, "Hz"),
486                              " , bytes = ",
487                              number(client_download_byte_rate, "Bps"),
488                              ])
489             total_file_check_rate = 1.0 * total_files / file_check_interval
490             client_check_share_rate = total_file_check_rate / num_users
491             add_output("Users",
492                        T.div["file check rate: shares = ",
493                              number(client_check_share_rate, "Hz"),
494                              " (interval = %s)" %
495                              number(1 / client_check_share_rate, "s"),
496                              ])
497
498             client_upload_share_rate = upload_rate * n
499             # TODO: doesn't include overhead
500             client_upload_byte_rate = upload_rate * file_size * expansion
501             add_output("Users",
502                        T.div["upload rate: shares = ",
503                              number(client_upload_share_rate, "Hz"),
504                              " , bytes = ",
505                              number(client_upload_byte_rate, "Bps"),
506                              ])
507             client_delete_share_rate = delete_rate * n
508
509             server_inbound_share_rate = (client_upload_share_rate *
510                                          num_users / num_servers)
511             server_inbound_byte_rate = (client_upload_byte_rate *
512                                         num_users / num_servers)
513             add_output("Servers",
514                        T.div["upload rate (inbound): shares = ",
515                              number(server_inbound_share_rate, "Hz"),
516                              " , bytes = ",
517                               number(server_inbound_byte_rate, "Bps"),
518                              ])
519             add_output("Servers",
520                        T.div["share check rate (inbound): ",
521                              number(total_file_check_rate * n / num_servers,
522                                     "Hz"),
523                              ])
524
525             server_share_modify_rate = ((client_upload_share_rate +
526                                          client_delete_share_rate) *
527                                          num_users / num_servers)
528             add_output("Servers",
529                        T.div["share modify rate: shares = ",
530                              number(server_share_modify_rate, "Hz"),
531                              ])
532
533             server_outbound_share_rate = (client_download_share_rate *
534                                           num_users / num_servers)
535             server_outbound_byte_rate = (client_download_byte_rate *
536                                          num_users / num_servers)
537             add_output("Servers",
538                        T.div["download rate (outbound): shares = ",
539                              number(server_outbound_share_rate, "Hz"),
540                              " , bytes = ",
541                               number(server_outbound_byte_rate, "Bps"),
542                              ])
543
544
545             total_share_space = num_servers * share_space_per_server
546             add_output("Grid",
547                        T.div["Share space consumed: ",
548                              number(total_share_space, "B")])
549             add_output("Grid",
550                        T.div[" %% validation: %.2f%%" %
551                              (100.0 * share_validation_per_server /
552                               share_space_per_server)])
553             add_output("Grid",
554                        T.div[" %% uri-extension: %.2f%%" %
555                              (100.0 * share_uri_extension_per_server /
556                               share_space_per_server)])
557             add_output("Grid",
558                        T.div[" %% lease data: %.2f%%" %
559                              (100.0 * share_lease_per_server /
560                               share_space_per_server)])
561             add_output("Grid",
562                        T.div[" %% owner data: %.2f%%" %
563                              (100.0 * share_ownertable_per_server /
564                               share_space_per_server)])
565             add_output("Grid",
566                        T.div[" %% share data: %.2f%%" %
567                              (100.0 * share_data_per_server /
568                               share_space_per_server)])
569             add_output("Grid",
570                        T.div["file check rate: ",
571                              number(total_file_check_rate,
572                                     "Hz")])
573
574             total_drives = max(div_ceil(int(total_share_space),
575                                         int(drive_size)),
576                                num_servers)
577             add_output("Drives",
578                        T.div["Total drives: ", number(total_drives), " drives"])
579             drives_per_server = div_ceil(total_drives, num_servers)
580             add_output("Servers",
581                        T.div["Drives per server: ", drives_per_server])
582
583             # costs
584             if drive_size == 3000 * 1e9:
585                 add_output("Servers", T.div["3000GB drive: $250 each"])
586                 drive_cost = 250
587             else:
588                 add_output("Servers",
589                            T.div[T.b["unknown cost per drive, assuming $100"]])
590                 drive_cost = 100
591
592             if drives_per_server <= 4:
593                 add_output("Servers", T.div["1U box with <= 4 drives: $1500"])
594                 server_cost = 1500 # typical 1U box
595             elif drives_per_server <= 12:
596                 add_output("Servers", T.div["2U box with <= 12 drives: $2500"])
597                 server_cost = 2500 # 2U box
598             else:
599                 add_output("Servers",
600                            T.div[T.b["Note: too many drives per server, "
601                                      "assuming $3000"]])
602                 server_cost = 3000
603
604             server_capital_cost = (server_cost + drives_per_server * drive_cost)
605             total_server_cost = float(num_servers * server_capital_cost)
606             add_output("Servers", T.div["Capital cost per server: $",
607                                         server_capital_cost])
608             add_output("Grid", T.div["Capital cost for all servers: $",
609                                      number(total_server_cost)])
610             # $70/Mbps/mo
611             # $44/server/mo power+space
612             server_bandwidth = max(server_inbound_byte_rate,
613                                    server_outbound_byte_rate)
614             server_bandwidth_mbps = div_ceil(int(server_bandwidth*8), int(1e6))
615             server_monthly_cost = 70*server_bandwidth_mbps + 44
616             add_output("Servers", T.div["Monthly cost per server: $",
617                                         server_monthly_cost])
618             add_output("Users", T.div["Capital cost per user: $",
619                                       number(total_server_cost / num_users)])
620
621             # reliability
622             any_drive_failure_rate = total_drives * drive_failure_rate
623             any_drive_MTBF = 1 // any_drive_failure_rate  # in seconds
624             any_drive_MTBF_days = any_drive_MTBF / 86400
625             add_output("Drives",
626                        T.div["MTBF (any drive): ",
627                              number(any_drive_MTBF_days), " days"])
628             drive_replacement_monthly_cost = (float(drive_cost)
629                                               * any_drive_failure_rate
630                                               *30*86400)
631             add_output("Grid",
632                        T.div["Monthly cost of replacing drives: $",
633                              number(drive_replacement_monthly_cost)])
634
635             total_server_monthly_cost = float(num_servers * server_monthly_cost
636                                               + drive_replacement_monthly_cost)
637
638             add_output("Grid", T.div["Monthly cost for all servers: $",
639                                      number(total_server_monthly_cost)])
640             add_output("Users",
641                        T.div["Monthly cost per user: $",
642                              number(total_server_monthly_cost / num_users)])
643
644             # availability
645             file_dBA = self.file_availability(k, n, server_dBA)
646             user_files_dBA = self.many_files_availability(file_dBA,
647                                                           files_per_user)
648             all_files_dBA = self.many_files_availability(file_dBA, total_files)
649             add_output("Users",
650                        T.div["availability of: ",
651                              "arbitrary file = %d dBA, " % file_dBA,
652                              "all files of user1 = %d dBA, " % user_files_dBA,
653                              "all files in grid = %d dBA" % all_files_dBA,
654                              ],
655                        )
656
657             time_until_files_lost = (n-k+1) / any_drive_failure_rate
658             add_output("Grid",
659                        T.div["avg time until files are lost: ",
660                              number(time_until_files_lost, "s"), ", ",
661                              number(time_until_files_lost/86400, " days"),
662                              ])
663
664             share_data_loss_rate = any_drive_failure_rate * drive_size
665             add_output("Grid",
666                        T.div["share data loss rate: ",
667                              number(share_data_loss_rate,"Bps")])
668
669             # the worst-case survival numbers occur when we do a file check
670             # and the file is just above the threshold for repair (so we
671             # decide to not repair it). The question is then: what is the
672             # chance that the file will decay so badly before the next check
673             # that we can't recover it? The resulting probability is per
674             # check interval.
675             # Note that the chances of us getting into this situation are low.
676             P_disk_failure_during_interval = (drive_failure_rate *
677                                               file_check_interval)
678             disk_failure_dBF = 10*math.log10(P_disk_failure_during_interval)
679             disk_failure_dBA = -disk_failure_dBF
680             file_survives_dBA = self.file_availability(k, repair_threshold,
681                                                        disk_failure_dBA)
682             user_files_survives_dBA = self.many_files_availability( \
683                 file_survives_dBA, files_per_user)
684             all_files_survives_dBA = self.many_files_availability( \
685                 file_survives_dBA, total_files)
686             add_output("Users",
687                        T.div["survival of: ",
688                              "arbitrary file = %d dBA, " % file_survives_dBA,
689                              "all files of user1 = %d dBA, " %
690                              user_files_survives_dBA,
691                              "all files in grid = %d dBA" %
692                              all_files_survives_dBA,
693                              " (per worst-case check interval)",
694                              ])
695
696
697
698         all_sections = []
699         all_sections.append(build_section("Users"))
700         all_sections.append(build_section("Servers"))
701         all_sections.append(build_section("Drives"))
702         if "Grid" in sections:
703             all_sections.append(build_section("Grid"))
704
705         f = T.form(action=".", method="post", enctype="multipart/form-data")
706
707         if filled:
708             action = "Recompute"
709         else:
710             action = "Compute"
711
712         f = f[T.input(type="hidden", name="filled", value="true"),
713               T.input(type="submit", value=action),
714               all_sections,
715               ]
716
717         try:
718             from allmydata import reliability
719             # we import this just to test to see if the page is available
720             _hush_pyflakes = reliability
721             del _hush_pyflakes
722             f = [T.div[T.a(href="../reliability")["Reliability Math"]], f]
723         except ImportError:
724             pass
725
726         return f
727
728     def file_availability(self, k, n, server_dBA):
729         """
730         The full formula for the availability of a specific file is::
731
732          1 - sum([choose(N,i) * p**i * (1-p)**(N-i)] for i in range(k)])
733
734         Where choose(N,i) = N! / ( i! * (N-i)! ) . Note that each term of
735         this summation is the probability that there are exactly 'i' servers
736         available, and what we're doing is adding up the cases where i is too
737         low.
738
739         This is a nuisance to calculate at all accurately, especially once N
740         gets large, and when p is close to unity. So we make an engineering
741         approximation: if (1-p) is very small, then each [i] term is much
742         larger than the [i-1] term, and the sum is dominated by the i=k-1
743         term. This only works for (1-p) < 10%, and when the choose() function
744         doesn't rise fast enough to compensate. For high-expansion encodings
745         (3-of-10, 25-of-100), the choose() function is rising at the same
746         time as the (1-p)**(N-i) term, so that's not an issue. For
747         low-expansion encodings (7-of-10, 75-of-100) the two values are
748         moving in opposite directions, so more care must be taken.
749
750         Note that the p**i term has only a minor effect as long as (1-p)*N is
751         small, and even then the effect is attenuated by the 1-p term.
752         """
753
754         assert server_dBA > 9  # >=90% availability to use the approximation
755         factor = binomial(n, k-1)
756         factor_dBA = 10 * math.log10(factor)
757         exponent = n - k + 1
758         file_dBA = server_dBA * exponent - factor_dBA
759         return file_dBA
760
761     def many_files_availability(self, file_dBA, num_files):
762         """The probability that 'num_files' independent bernoulli trials will
763         succeed (i.e. we can recover all files in the grid at any given
764         moment) is p**num_files . Since p is close to unity, we express in p
765         in dBA instead, so we can get useful precision on q (=1-p), and then
766         the formula becomes::
767
768          P_some_files_unavailable = 1 - (1 - q)**num_files
769
770         That (1-q)**n expands with the usual binomial sequence, 1 - nq +
771         Xq**2 ... + Xq**n . We use the same approximation as before, since we
772         know q is close to zero, and we get to ignore all the terms past -nq.
773         """
774
775         many_files_dBA = file_dBA - 10 * math.log10(num_files)
776         return many_files_dBA