]> git.rkrishnan.org Git - tahoe-lafs/tahoe-lafs.git/blob - src/allmydata/test/test_checker.py
816c57fd8d8095e3cf2cd6aa14a3866338d82611
[tahoe-lafs/tahoe-lafs.git] / src / allmydata / test / test_checker.py
1
2 import simplejson
3 import os.path, shutil
4 from twisted.trial import unittest
5 from twisted.internet import defer
6 from allmydata import check_results, uri
7 from allmydata import uri as tahoe_uri
8 from allmydata.util import base32
9 from allmydata.web import check_results as web_check_results
10 from allmydata.storage_client import StorageFarmBroker, NativeStorageServer
11 from allmydata.storage.server import storage_index_to_dir
12 from allmydata.monitor import Monitor
13 from allmydata.test.no_network import GridTestMixin
14 from allmydata.immutable.upload import Data
15 from allmydata.test.common_web import WebRenderingMixin
16 from allmydata.mutable.publish import MutableData
17
class FakeClient:
    """Minimal stand-in for a Tahoe client object.

    Exposes only ``get_storage_broker``; tests assign the
    ``storage_broker`` attribute directly before use.
    """
    def get_storage_broker(self):
        """Return the broker previously stashed on this instance."""
        return self.storage_broker
21
class WebResultsRendering(unittest.TestCase, WebRenderingMixin):
    """Exercise the web-side renderers for checker results.

    These tests build CheckResults / CheckAndRepairResults objects by hand,
    feed them to the web renderers, and verify both the rendered HTML
    (after tag-stripping) and the JSON output.
    """

    def create_fake_client(self):
        """Return a FakeClient whose StorageFarmBroker holds three fake
        storage servers (nicknames "peer-0", "peer-f", "peer-11")."""
        sb = StorageFarmBroker(None, True)
        # s.get_name() (the "short description") will be "v0-00000000".
        # s.get_longname() will include the -long suffix.
        # s.get_peerid() (i.e. tubid) will be "aaa.." or "777.." or "ceir.."
        servers = [("v0-00000000-long", "\x00"*20, "peer-0"),
                   ("v0-ffffffff-long", "\xff"*20, "peer-f"),
                   ("v0-11111111-long", "\x11"*20, "peer-11")]
        for (key_s, peerid, nickname) in servers:
            tubid_b32 = base32.b2a(peerid)
            furl = "pb://%s@nowhere/fake" % tubid_b32
            # Minimal v0 introducer announcement for a storage service.
            ann = { "version": 0,
                    "service-name": "storage",
                    "anonymous-storage-FURL": furl,
                    "permutation-seed-base32": "",
                    "nickname": unicode(nickname),
                    "app-versions": {}, # need #466 and v2 introducer
                    "my-version": "ver",
                    "oldest-supported": "oldest",
                    }
            s = NativeStorageServer(key_s, ann)
            sb.test_add_server(peerid, s) # XXX: maybe use key_s?
        c = FakeClient()
        c.storage_broker = sb
        return c

    def render_json(self, page):
        """Render `page` with ?output=json and return the Deferred result."""
        d = self.render1(page, args={"output": ["json"]})
        return d

    def test_literal(self):
        """LiteralCheckResultsRenderer: literal files always render as
        healthy, honor the return_to link, and emit an empty storage-index
        in JSON."""
        c = self.create_fake_client()
        lcr = web_check_results.LiteralCheckResultsRenderer(c)

        d = self.render1(lcr)
        def _check(html):
            s = self.remove_tags(html)
            self.failUnlessIn("Literal files are always healthy", s)
        d.addCallback(_check)
        d.addCallback(lambda ignored:
                      self.render1(lcr, args={"return_to": ["FOOURL"]}))
        def _check_return_to(html):
            s = self.remove_tags(html)
            self.failUnlessIn("Literal files are always healthy", s)
            # return_to must appear as a real anchor in the raw HTML
            self.failUnlessIn('<a href="FOOURL">Return to file.</a>',
                              html)
        d.addCallback(_check_return_to)
        d.addCallback(lambda ignored: self.render_json(lcr))
        def _check_json(json):
            j = simplejson.loads(json)
            self.failUnlessEqual(j["storage-index"], "")
            self.failUnlessEqual(j["results"]["healthy"], True)
        d.addCallback(_check_json)
        return d

    def test_check(self):
        """CheckResultsRenderer: walk one CheckResults object through the
        healthy / unhealthy-but-recoverable / unrecoverable states and
        verify the HTML summary lines and JSON structure for each."""
        c = self.create_fake_client()
        serverid_1 = "\x00"*20
        serverid_f = "\xff"*20
        u = uri.CHKFileURI("\x00"*16, "\x00"*32, 3, 10, 1234)
        cr = check_results.CheckResults(u, u.get_storage_index())
        cr.set_healthy(True)
        cr.set_needs_rebalancing(False)
        cr.set_summary("groovy")
        data = { "count-shares-needed": 3,
                 "count-shares-expected": 9,
                 "count-shares-good": 10,
                 "count-good-share-hosts": 11,
                 "count-corrupt-shares": 0,
                 "list-corrupt-shares": [],
                 "count-wrong-shares": 0,
                 "sharemap": {"shareid1": [serverid_1, serverid_f]},
                 "count-recoverable-versions": 1,
                 "count-unrecoverable-versions": 0,
                 "servers-responding": [],
                 }
        cr.set_data(data)

        # healthy rendering
        w = web_check_results.CheckResultsRenderer(c, cr)
        html = self.render2(w)
        s = self.remove_tags(html)
        self.failUnlessIn("File Check Results for SI=2k6avp", s) # abbreviated
        self.failUnlessIn("Healthy : groovy", s)
        self.failUnlessIn("Share Counts: need 3-of-9, have 10", s)
        self.failUnlessIn("Hosts with good shares: 11", s)
        self.failUnlessIn("Corrupt shares: none", s)
        self.failUnlessIn("Wrong Shares: 0", s)
        self.failUnlessIn("Recoverable Versions: 1", s)
        self.failUnlessIn("Unrecoverable Versions: 0", s)

        # unhealthy but still recoverable
        cr.set_healthy(False)
        cr.set_recoverable(True)
        cr.set_summary("ungroovy")
        html = self.render2(w)
        s = self.remove_tags(html)
        self.failUnlessIn("File Check Results for SI=2k6avp", s) # abbreviated
        self.failUnlessIn("Not Healthy! : ungroovy", s)

        # unrecoverable, with a corrupt share listed
        cr.set_healthy(False)
        cr.set_recoverable(False)
        cr.set_summary("rather dead")
        data["count-corrupt-shares"] = 1
        data["list-corrupt-shares"] = [(serverid_1, u.get_storage_index(), 2)]
        cr.set_data(data)
        html = self.render2(w)
        s = self.remove_tags(html)
        self.failUnlessIn("File Check Results for SI=2k6avp", s) # abbreviated
        self.failUnlessIn("Not Recoverable! : rather dead", s)
        self.failUnlessIn("Corrupt shares: Share ID Nickname Node ID sh#2 peer-0 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", s)

        # rendering twice from the same results must be stable
        html = self.render2(w)
        s = self.remove_tags(html)
        self.failUnlessIn("File Check Results for SI=2k6avp", s) # abbreviated
        self.failUnlessIn("Not Recoverable! : rather dead", s)

        html = self.render2(w, args={"return_to": ["FOOURL"]})
        self.failUnlessIn('<a href="FOOURL">Return to file/directory.</a>',
                          html)

        d = self.render_json(w)
        def _check_json(jdata):
            # JSON output must carry the full (unabbreviated) storage index
            # and base32-encoded serverids.
            j = simplejson.loads(jdata)
            self.failUnlessEqual(j["summary"], "rather dead")
            self.failUnlessEqual(j["storage-index"],
                                 "2k6avpjga3dho3zsjo6nnkt7n4")
            expected = {'needs-rebalancing': False,
                        'count-shares-expected': 9,
                        'healthy': False,
                        'count-unrecoverable-versions': 0,
                        'count-shares-needed': 3,
                        'sharemap': {"shareid1":
                                     ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
                                      "77777777777777777777777777777777"]},
                        'count-recoverable-versions': 1,
                        'list-corrupt-shares':
                        [["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
                          "2k6avpjga3dho3zsjo6nnkt7n4", 2]],
                        'count-good-share-hosts': 11,
                        'count-wrong-shares': 0,
                        'count-shares-good': 10,
                        'count-corrupt-shares': 1,
                        'servers-responding': [],
                        'recoverable': False,
                        }
            self.failUnlessEqual(j["results"], expected)
        d.addCallback(_check_json)
        d.addCallback(lambda ignored: self.render1(w))
        def _check(html):
            s = self.remove_tags(html)
            self.failUnlessIn("File Check Results for SI=2k6avp", s)
            self.failUnlessIn("Not Recoverable! : rather dead", s)
        d.addCallback(_check)
        return d


    def test_check_and_repair(self):
        """CheckAndRepairResultsRenderer: verify HTML for the
        no-repair-needed, repair-successful, and repair-unsuccessful
        states, plus the JSON output (including the literal-file case)."""
        c = self.create_fake_client()
        serverid_1 = "\x00"*20
        serverid_f = "\xff"*20
        u = uri.CHKFileURI("\x00"*16, "\x00"*32, 3, 10, 1234)

        # pre-repair: unhealthy but recoverable
        pre_cr = check_results.CheckResults(u, u.get_storage_index())
        pre_cr.set_healthy(False)
        pre_cr.set_recoverable(True)
        pre_cr.set_needs_rebalancing(False)
        pre_cr.set_summary("illing")
        data = { "count-shares-needed": 3,
                 "count-shares-expected": 10,
                 "count-shares-good": 6,
                 "count-good-share-hosts": 7,
                 "count-corrupt-shares": 0,
                 "list-corrupt-shares": [],
                 "count-wrong-shares": 0,
                 "sharemap": {"shareid1": [serverid_1, serverid_f]},
                 "count-recoverable-versions": 1,
                 "count-unrecoverable-versions": 0,
                 "servers-responding": [],
                 }
        pre_cr.set_data(data)

        # post-repair: healthy
        post_cr = check_results.CheckResults(u, u.get_storage_index())
        post_cr.set_healthy(True)
        post_cr.set_recoverable(True)
        post_cr.set_needs_rebalancing(False)
        post_cr.set_summary("groovy")
        data = { "count-shares-needed": 3,
                 "count-shares-expected": 10,
                 "count-shares-good": 10,
                 "count-good-share-hosts": 11,
                 "count-corrupt-shares": 0,
                 "list-corrupt-shares": [],
                 "count-wrong-shares": 0,
                 "sharemap": {"shareid1": [serverid_1, serverid_f]},
                 "count-recoverable-versions": 1,
                 "count-unrecoverable-versions": 0,
                 "servers-responding": [],
                 }
        post_cr.set_data(data)

        crr = check_results.CheckAndRepairResults(u.get_storage_index())
        crr.pre_repair_results = pre_cr
        crr.post_repair_results = post_cr
        crr.repair_attempted = False

        # case 1: no repair attempted
        w = web_check_results.CheckAndRepairResultsRenderer(c, crr)
        html = self.render2(w)
        s = self.remove_tags(html)

        self.failUnlessIn("File Check-And-Repair Results for SI=2k6avp", s)
        self.failUnlessIn("Healthy : groovy", s)
        self.failUnlessIn("No repair necessary", s)
        self.failUnlessIn("Post-Repair Checker Results:", s)
        self.failUnlessIn("Share Counts: need 3-of-10, have 10", s)

        # case 2: repair attempted and succeeded
        crr.repair_attempted = True
        crr.repair_successful = True
        html = self.render2(w)
        s = self.remove_tags(html)

        self.failUnlessIn("File Check-And-Repair Results for SI=2k6avp", s)
        self.failUnlessIn("Healthy : groovy", s)
        self.failUnlessIn("Repair successful", s)
        self.failUnlessIn("Post-Repair Checker Results:", s)

        # case 3: repair attempted but file still unhealthy
        crr.repair_attempted = True
        crr.repair_successful = False
        post_cr.set_healthy(False)
        post_cr.set_summary("better")
        html = self.render2(w)
        s = self.remove_tags(html)

        self.failUnlessIn("File Check-And-Repair Results for SI=2k6avp", s)
        self.failUnlessIn("Not Healthy! : better", s)
        self.failUnlessIn("Repair unsuccessful", s)
        self.failUnlessIn("Post-Repair Checker Results:", s)

        # case 4: repair attempted but file now unrecoverable
        crr.repair_attempted = True
        crr.repair_successful = False
        post_cr.set_healthy(False)
        post_cr.set_recoverable(False)
        post_cr.set_summary("worse")
        html = self.render2(w)
        s = self.remove_tags(html)

        self.failUnlessIn("File Check-And-Repair Results for SI=2k6avp", s)
        self.failUnlessIn("Not Recoverable! : worse", s)
        self.failUnlessIn("Repair unsuccessful", s)
        self.failUnlessIn("Post-Repair Checker Results:", s)

        d = self.render_json(w)
        def _got_json(data):
            j = simplejson.loads(data)
            self.failUnlessEqual(j["repair-attempted"], True)
            self.failUnlessEqual(j["storage-index"],
                                 "2k6avpjga3dho3zsjo6nnkt7n4")
            self.failUnlessEqual(j["pre-repair-results"]["summary"], "illing")
            self.failUnlessEqual(j["post-repair-results"]["summary"], "worse")
        d.addCallback(_got_json)

        # results=None is how literal files are rendered
        w2 = web_check_results.CheckAndRepairResultsRenderer(c, None)
        d.addCallback(lambda ignored: self.render_json(w2))
        def _got_lit_results(data):
            j = simplejson.loads(data)
            self.failUnlessEqual(j["repair-attempted"], False)
            self.failUnlessEqual(j["storage-index"], "")
        d.addCallback(_got_lit_results)
        return d
291
class BalancingAct(GridTestMixin, unittest.TestCase):
    # test for #1115 regarding the 'count-good-share-hosts' metric


    def add_server(self, server_number, readonly=False):
        """Create a new storage server in the no-network grid and add it.

        Requires self.g (the grid) to have been set up already.
        """
        assert self.g, "I tried to find a grid at self.g, but failed"
        ss = self.g.make_server(server_number, readonly)
        #log.msg("just created a server, number: %s => %s" % (server_number, ss,))
        self.g.add_server(server_number, ss)

    def add_server_with_share(self, server_number, uri, share_number=None,
                              readonly=False):
        """Add a new server and, if share_number is given, copy that share
        of `uri` onto it from the first server."""
        self.add_server(server_number, readonly)
        if share_number is not None:
            self.copy_share_to_server(uri, share_number, server_number)

    def copy_share_to_server(self, uri, share_number, server_number):
        """Copy one share of `uri` into server_number's share directory
        and verify it is then visible via find_uri_shares."""
        ss = self.g.servers_by_number[server_number]
        # Copy share i from the directory associated with the first
        # storage server to the directory associated with this one.
        assert self.g, "I tried to find a grid at self.g, but failed"
        assert self.shares, "I tried to find shares at self.shares, but failed"
        old_share_location = self.shares[share_number][2]
        new_share_location = os.path.join(ss.storedir, "shares")
        si = tahoe_uri.from_string(self.uri).get_storage_index()
        new_share_location = os.path.join(new_share_location,
                                          storage_index_to_dir(si))
        if not os.path.exists(new_share_location):
            os.makedirs(new_share_location)
        new_share_location = os.path.join(new_share_location,
                                          str(share_number))
        # guard against copying a share file onto itself
        if old_share_location != new_share_location:
            shutil.copy(old_share_location, new_share_location)
        shares = self.find_uri_shares(uri)
        # Make sure that the storage server has the share.
        self.failUnless((share_number, ss.my_nodeid, new_share_location)
                        in shares)

    def _pretty_shares_chart(self, uri):
        """Return {shnum: [server letters]} for debugging printouts."""
        # Servers are labeled A-Z, shares are labeled 0-9
        letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        assert len(self.g.servers_by_number) < len(letters), \
            "This little printing function is only meant for < 26 servers"
        shares_chart = {}
        names = dict(zip([ss.my_nodeid
                          for _,ss in self.g.servers_by_number.iteritems()],
                         letters))
        for shnum, serverid, _ in self.find_uri_shares(uri):
            shares_chart.setdefault(shnum, []).append(names[serverid])
        return shares_chart

    def test_good_share_hosts(self):
        """Upload a 3-of-4 immutable file, spread copies of share 3 over
        four extra servers, then verify 'count-good-share-hosts' stays
        correct through repair and after breaking all servers (#1115)."""
        self.basedir = "checker/BalancingAct/1115"
        self.set_up_grid(num_servers=1)
        c0 = self.g.clients[0]
        c0.DEFAULT_ENCODING_PARAMETERS['happy'] = 1
        c0.DEFAULT_ENCODING_PARAMETERS['n'] = 4
        c0.DEFAULT_ENCODING_PARAMETERS['k'] = 3

        DATA = "data" * 100
        d = c0.upload(Data(DATA, convergence=""))
        def _stash_immutable(ur):
            self.imm = c0.create_node_from_uri(ur.get_uri())
            self.uri = self.imm.get_uri()
        d.addCallback(_stash_immutable)
        d.addCallback(lambda ign:
            self.find_uri_shares(self.uri))
        def _store_shares(shares):
            self.shares = shares
        d.addCallback(_store_shares)

        def add_three(_, i):
            # Add a new server with just share 3
            self.add_server_with_share(i, self.uri, 3)
            #print self._pretty_shares_chart(self.uri)
        for i in range(1,5):
            d.addCallback(add_three, i)

        def _check_and_repair(_):
            return self.imm.check_and_repair(Monitor())
        def _check_counts(crr, shares_good, good_share_hosts):
            p_crr = crr.get_post_repair_results().get_data()
            #print self._pretty_shares_chart(self.uri)
            self.failUnlessEqual(p_crr['count-shares-good'], shares_good)
            self.failUnlessEqual(p_crr['count-good-share-hosts'],
                                 good_share_hosts)

        """
        Initial sharemap:
            0:[A] 1:[A] 2:[A] 3:[A,B,C,D,E]
          4 good shares, but 5 good hosts
        After deleting all instances of share #3 and repairing:
            0:[A,B], 1:[A,C], 2:[A,D], 3:[E]
          Still 4 good shares and 5 good hosts
            """
        d.addCallback(_check_and_repair)
        d.addCallback(_check_counts, 4, 5)
        d.addCallback(lambda _: self.delete_shares_numbered(self.uri, [3]))
        d.addCallback(_check_and_repair)
        d.addCallback(_check_counts, 4, 5)
        # break every server: no shares and no hosts should remain good
        d.addCallback(lambda _: [self.g.break_server(sid)
                                 for sid in self.g.get_all_serverids()])
        d.addCallback(_check_and_repair)
        d.addCallback(_check_counts, 0, 0)
        return d
397
class AddLease(GridTestMixin, unittest.TestCase):
    # test for #875, in which failures in the add-lease call cause
    # false-negatives in the checker

    def test_875(self):
        """Checker results must stay healthy even when the server's
        remote_add_lease call raises: lease failures are best-effort and
        must not produce false-negative check results (#875)."""
        self.basedir = "checker/AddLease/875"
        self.set_up_grid(num_servers=1)
        c0 = self.g.clients[0]
        c0.DEFAULT_ENCODING_PARAMETERS['happy'] = 1
        self.uris = {}
        DATA = "data" * 100
        d = c0.upload(Data(DATA, convergence=""))
        def _stash_immutable(ur):
            self.imm = c0.create_node_from_uri(ur.get_uri())
        d.addCallback(_stash_immutable)
        d.addCallback(lambda ign:
            c0.create_mutable_file(MutableData("contents")))
        def _stash_mutable(node):
            self.mut = node
        d.addCallback(_stash_mutable)

        def _check_cr(cr, which):
            # `which` labels the failing case in the assertion message
            self.failUnless(cr.is_healthy(), which)

        # these two should work normally
        d.addCallback(lambda ign: self.imm.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "immutable-normal")
        d.addCallback(lambda ign: self.mut.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "mutable-normal")

        really_did_break = []
        # now break the server's remote_add_lease call
        def _break_add_lease(ign):
            def broken_add_lease(*args, **kwargs):
                really_did_break.append(1)
                raise KeyError("intentional failure, should be ignored")
            assert self.g.servers_by_number[0].remote_add_lease
            self.g.servers_by_number[0].remote_add_lease = broken_add_lease
        d.addCallback(_break_add_lease)

        # and confirm that the files still look healthy
        d.addCallback(lambda ign: self.mut.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "mutable-broken")
        d.addCallback(lambda ign: self.imm.check(Monitor(), add_lease=True))
        d.addCallback(_check_cr, "immutable-broken")

        # prove the monkeypatch was actually exercised
        d.addCallback(lambda ign: self.failUnless(really_did_break))
        return d
446
class CounterHolder(object):
    """Mutable holder for block-fetch concurrency counters.

    Shared between MockVRBP instances so the test can observe the
    high-water mark of concurrent get_block calls.
    """
    def __init__(self):
        # number of get_block calls currently in flight
        self._num_active_block_fetches = 0
        # highest concurrency level observed so far
        self._max_active_block_fetches = 0
451
452 from allmydata.immutable.checker import ValidatedReadBucketProxy
class MockVRBP(ValidatedReadBucketProxy):
    """A ValidatedReadBucketProxy that records, via a shared CounterHolder,
    how many get_block calls are in flight at once."""
    def __init__(self, sharenum, bucket, share_hash_tree, num_blocks, block_size, share_size, counterholder):
        ValidatedReadBucketProxy.__init__(self, sharenum, bucket,
                                          share_hash_tree, num_blocks,
                                          block_size, share_size)
        # shared CounterHolder instance for concurrency bookkeeping
        self.counterholder = counterholder

    def get_block(self, blocknum):
        """Fetch a block, bumping the active-fetch counter for the
        duration and tracking the high-water mark."""
        self.counterholder._num_active_block_fetches += 1
        if self.counterholder._num_active_block_fetches > self.counterholder._max_active_block_fetches:
            self.counterholder._max_active_block_fetches = self.counterholder._num_active_block_fetches
        d = ValidatedReadBucketProxy.get_block(self, blocknum)
        def _mark_no_longer_active(res):
            # decrement on both success and failure paths
            self.counterholder._num_active_block_fetches -= 1
            return res
        d.addBoth(_mark_no_longer_active)
        return d
470
class TooParallel(GridTestMixin, unittest.TestCase):
    # bug #1395: immutable verifier was aggressively parallized, checking all
    # blocks of all shares at the same time, blowing our memory budget and
    # crashing with MemoryErrors on >1GB files.

    def test_immutable(self):
        """Verify (with verify=True) a 1-of-4 file of 80 blocks and assert
        the verifier fetches at most one block per share at a time, i.e.
        at most 4 parallel fetches across the 4 shares (#1395)."""
        import allmydata.immutable.checker
        origVRBP = allmydata.immutable.checker.ValidatedReadBucketProxy

        self.basedir = "checker/TooParallel/immutable"

        # If any code asks to instantiate a ValidatedReadBucketProxy,
        # we give them a MockVRBP which is configured to use our
        # CounterHolder.
        counterholder = CounterHolder()
        def make_mock_VRBP(*args, **kwargs):
            return MockVRBP(counterholder=counterholder, *args, **kwargs)
        allmydata.immutable.checker.ValidatedReadBucketProxy = make_mock_VRBP

        d = defer.succeed(None)
        def _start(ign):
            self.set_up_grid(num_servers=4)
            self.c0 = self.g.clients[0]
            # small segments to force many blocks per share
            self.c0.DEFAULT_ENCODING_PARAMETERS = { "k": 1,
                                               "happy": 4,
                                               "n": 4,
                                               "max_segment_size": 5,
                                               }
            self.uris = {}
            DATA = "data" * 100 # 400/5 = 80 blocks
            return self.c0.upload(Data(DATA, convergence=""))
        d.addCallback(_start)
        def _do_check(ur):
            n = self.c0.create_node_from_uri(ur.get_uri())
            return n.check(Monitor(), verify=True)
        d.addCallback(_do_check)
        def _check(cr):
            # the verifier works on all 4 shares in parallel, but only
            # fetches one block from each share at a time, so we expect to
            # see 4 parallel fetches
            self.failUnlessEqual(counterholder._max_active_block_fetches, 4)
        d.addCallback(_check)
        def _clean_up(res):
            # restore the real class even if the test failed
            allmydata.immutable.checker.ValidatedReadBucketProxy = origVRBP
            return res
        d.addBoth(_clean_up)
        return d

    # verification of 80 blocks can be slow; raise Trial's timeout
    test_immutable.timeout = 80