# -*- coding: utf-8 -*-
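"""Tests for Tahoe-LAFS download behaviour when storage servers hang,
break, or lose their shares.

The immutable downloader is expected to route around hung servers (its
ShareFinder marks slow requests as OVERDUE and queries other servers);
the mutable downloader does not yet handle hung servers, so those cases
are skipped at the bottom of this module.
"""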

import os, shutil
from twisted.trial import unittest
from twisted.internet import defer
from allmydata import uri
from allmydata.util.consumer import download_to_data
from allmydata.immutable import upload
from allmydata.mutable.common import UnrecoverableFileError
from allmydata.storage.common import storage_index_to_dir
from allmydata.test.no_network import GridTestMixin
from allmydata.test.common import ShouldFailMixin
from allmydata.util.pollmixin import PollMixin
from allmydata.interfaces import NotEnoughSharesError

immutable_plaintext = "data" * 10000
mutable_plaintext = "muta" * 10000

class HungServerDownloadTest(GridTestMixin, ShouldFailMixin, PollMixin,
                             unittest.TestCase):
    # Many of these tests take around 60 seconds on François's ARM buildslave:
    # http://tahoe-lafs.org/buildbot/builders/FranXois%20lenny-armv5tel
    # allmydata.test.test_hung_server.HungServerDownloadTest.test_2_good_8_broken_duplicate_share_fail
    # once ERRORed after 197 seconds on Midnight Magic's NetBSD buildslave:
    # http://tahoe-lafs.org/buildbot/builders/MM%20netbsd4%20i386%20warp
    # MM's buildslave varies a lot in how long it takes to run tests.

    timeout = 240

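    # Simulate server failures using the no_network grid: _break marks
    # servers as broken, _hang stalls their responses, and _unhang lets
    # the stalled responses proceed.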
    def _break(self, servers):
        for (id, ss) in servers:
            self.g.break_server(id)

    def _hang(self, servers, **kwargs):
        for (id, ss) in servers:
            self.g.hang_server(id, **kwargs)

    def _unhang(self, servers, **kwargs):
        for (id, ss) in servers:
            self.g.unhang_server(id, **kwargs)

    def _hang_shares(self, shnums, **kwargs):
        # hang all servers who are holding the given shares
        hung_serverids = set()
        for (i_shnum, i_serverid, i_sharefile) in self.shares:
            if i_shnum in shnums:
                if i_serverid not in hung_serverids:
                    self.g.hang_server(i_serverid, **kwargs)
                    hung_serverids.add(i_serverid)

    def _delete_all_shares_from(self, servers):
        serverids = [id for (id, ss) in servers]
        for (i_shnum, i_serverid, i_sharefile) in self.shares:
            if i_serverid in serverids:
                os.unlink(i_sharefile)

    def _corrupt_all_shares_in(self, servers, corruptor_func):
        serverids = [id for (id, ss) in servers]
        for (i_shnum, i_serverid, i_sharefile) in self.shares:
            if i_serverid in serverids:
                self._corrupt_share((i_shnum, i_sharefile), corruptor_func)

    def _copy_all_shares_from(self, from_servers, to_server):
        serverids = [id for (id, ss) in from_servers]
        for (i_shnum, i_serverid, i_sharefile) in self.shares:
            if i_serverid in serverids:
                self._copy_share((i_shnum, i_sharefile), to_server)

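    # Copy a single share file into to_server's share directory for this
    # file's storage index, then rescan so self.shares includes the copy.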
    def _copy_share(self, share, to_server):
        (sharenum, sharefile) = share
        (id, ss) = to_server
        shares_dir = os.path.join(ss.original.storedir, "shares")
        si = uri.from_string(self.uri).get_storage_index()
        si_dir = os.path.join(shares_dir, storage_index_to_dir(si))
        if not os.path.exists(si_dir):
            os.makedirs(si_dir)
        new_sharefile = os.path.join(si_dir, str(sharenum))
        shutil.copy(sharefile, new_sharefile)
        self.shares = self.find_uri_shares(self.uri)
        # Make sure that the storage server has the share.
        self.failUnless((sharenum, ss.original.my_nodeid, new_sharefile)
                        in self.shares)

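    # Rewrite a share file in place with corruptor_func(original contents).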
    def _corrupt_share(self, share, corruptor_func):
        (sharenum, sharefile) = share
        data = open(sharefile, "rb").read()
        newdata = corruptor_func(data)
        os.unlink(sharefile)
        wf = open(sharefile, "wb")
        wf.write(newdata)
        wf.close()

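    # Build a no-network grid, upload one mutable or immutable file as
    # client 0, and record its URI and share locations in self.uri and
    # self.shares.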
    def _set_up(self, mutable, testdir, num_clients=1, num_servers=10):
        self.mutable = mutable
        if mutable:
            self.basedir = "hung_server/mutable_" + testdir
        else:
            self.basedir = "hung_server/immutable_" + testdir

        self.set_up_grid(num_clients=num_clients, num_servers=num_servers)

        self.c0 = self.g.clients[0]
        nm = self.c0.nodemaker
        self.servers = sorted([(s.get_serverid(), s.get_rref())
                               for s in nm.storage_broker.get_connected_servers()])
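        # rotate the sorted server list by five positions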
        self.servers = self.servers[5:] + self.servers[:5]

        if mutable:
            d = nm.create_mutable_file(mutable_plaintext)
            def _uploaded_mutable(node):
                self.uri = node.get_uri()
                self.shares = self.find_uri_shares(self.uri)
            d.addCallback(_uploaded_mutable)
        else:
            data = upload.Data(immutable_plaintext, convergence="")
            d = self.c0.upload(data)
            def _uploaded_immutable(upload_res):
                self.uri = upload_res.uri
                self.shares = self.find_uri_shares(self.uri)
            d.addCallback(_uploaded_immutable)
        return d

    def _start_download(self):
        n = self.c0.create_node_from_uri(self.uri)
        if self.mutable:
            d = n.download_best_version()
        else:
            d = download_to_data(n)
        return d

    def _wait_for_data(self, n):
        if self.mutable:
            d = n.download_best_version()
        else:
            d = download_to_data(n)
        return d

    def _check(self, resultingdata):
        if self.mutable:
            self.failUnlessEqual(resultingdata, mutable_plaintext)
        else:
            self.failUnlessEqual(resultingdata, immutable_plaintext)

    def _download_and_check(self):
        d = self._start_download()
        d.addCallback(self._check)
        return d

    def _should_fail_download(self):
        if self.mutable:
            return self.shouldFail(UnrecoverableFileError, self.basedir,
                                   "no recoverable versions",
                                   self._download_and_check)
        else:
            return self.shouldFail(NotEnoughSharesError, self.basedir,
                                   "ran out of shares",
                                   self._download_and_check)


    def test_10_good_sanity_check(self):
        d = defer.succeed(None)
        for mutable in [False, True]:
            d.addCallback(lambda ign: self._set_up(mutable, "test_10_good_sanity_check"))
            d.addCallback(lambda ign: self._download_and_check())
        return d

    def test_10_good_copied_share(self):
        d = defer.succeed(None)
        for mutable in [False, True]:
            d.addCallback(lambda ign: self._set_up(mutable, "test_10_good_copied_share"))
            d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[2:3], self.servers[0]))
            d.addCallback(lambda ign: self._download_and_check())
        return d

    def test_3_good_7_noshares(self):
        d = defer.succeed(None)
        for mutable in [False, True]:
            d.addCallback(lambda ign: self._set_up(mutable, "test_3_good_7_noshares"))
            d.addCallback(lambda ign: self._delete_all_shares_from(self.servers[3:]))
            d.addCallback(lambda ign: self._download_and_check())
        return d

    def test_2_good_8_broken_fail(self):
        d = defer.succeed(None)
        for mutable in [False, True]:
            d.addCallback(lambda ign: self._set_up(mutable, "test_2_good_8_broken_fail"))
            d.addCallback(lambda ign: self._break(self.servers[2:]))
            d.addCallback(lambda ign: self._should_fail_download())
        return d

    def test_2_good_8_noshares_fail(self):
        d = defer.succeed(None)
        for mutable in [False, True]:
            d.addCallback(lambda ign: self._set_up(mutable, "test_2_good_8_noshares_fail"))
            d.addCallback(lambda ign: self._delete_all_shares_from(self.servers[2:]))
            d.addCallback(lambda ign: self._should_fail_download())
        return d

    def test_2_good_8_broken_copied_share(self):
        d = defer.succeed(None)
        for mutable in [False, True]:
            d.addCallback(lambda ign: self._set_up(mutable, "test_2_good_8_broken_copied_share"))
            d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[2:3], self.servers[0]))
            d.addCallback(lambda ign: self._break(self.servers[2:]))
            d.addCallback(lambda ign: self._download_and_check())
        return d

    def test_2_good_8_broken_duplicate_share_fail(self):
        d = defer.succeed(None)
        for mutable in [False, True]:
            d.addCallback(lambda ign: self._set_up(mutable, "test_2_good_8_broken_duplicate_share_fail"))
            d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[1:2], self.servers[0]))
            d.addCallback(lambda ign: self._break(self.servers[2:]))
            d.addCallback(lambda ign: self._should_fail_download())
        return d

    def test_3_good_7_hung_immutable(self):
        d = defer.succeed(None)
        d.addCallback(lambda ign: self._set_up(False, "test_3_good_7_hung"))
        d.addCallback(lambda ign: self._hang(self.servers[3:]))
        d.addCallback(lambda ign: self._download_and_check())
        return d

    def test_5_overdue_immutable(self):
        # restrict the ShareFinder to only allow 5 outstanding requests, and
        # arrange for the first 5 servers to hang. Then trigger the OVERDUE
        # timers (simulating 10 seconds passed), at which point the
        # ShareFinder should send additional queries and finish the download
        # quickly. If we didn't have OVERDUE timers, this test would fail by
        # timing out.
        done = []
        d = self._set_up(False, "test_5_overdue_immutable")
        def _reduce_max_outstanding_requests_and_download(ign):
            self._hang_shares(range(5))
            n = self.c0.create_node_from_uri(self.uri)
            n._cnode._maybe_create_download_node()
            self._sf = n._cnode._node._sharefinder
            self._sf.max_outstanding_requests = 5
            self._sf.OVERDUE_TIMEOUT = 1000.0
            d2 = download_to_data(n)
            # start download, but don't wait for it to complete yet
            def _done(res):
                done.append(res) # we will poll for this later
            d2.addBoth(_done)
        d.addCallback(_reduce_max_outstanding_requests_and_download)
        from foolscap.eventual import fireEventually, flushEventualQueue
        # wait here a while
        d.addCallback(lambda res: fireEventually(res))
        d.addCallback(lambda res: flushEventualQueue())
        d.addCallback(lambda ign: self.failIf(done))
        def _check_waiting(ign):
            # all the share requests should now be stuck waiting
            self.failUnlessEqual(len(self._sf.pending_requests), 5)
            # but none should be marked as OVERDUE until the timers expire
            self.failUnlessEqual(len(self._sf.overdue_requests), 0)
        d.addCallback(_check_waiting)
        def _mark_overdue(ign):
            # declare four requests overdue, allowing new requests to take
            # their place, and leaving one stuck. The finder will keep
            # sending requests until there are 5 non-overdue ones
            # outstanding, at which point we'll have 4 OVERDUE, 1
            # stuck-but-not-overdue, and 4 live requests. All 4 live requests
            # will retire before the download is complete and the ShareFinder
            # is shut off. That will leave 4 OVERDUE and 1
            # stuck-but-not-overdue, for a total of 5 requests in
            # _sf.pending_requests
            for t in self._sf.overdue_timers.values()[:4]:
                t.reset(-1.0)
            # the timers ought to fire before the eventual-send does
            return fireEventually()
        d.addCallback(_mark_overdue)
        def _we_are_done():
            return bool(done)
        d.addCallback(lambda ign: self.poll(_we_are_done))
        def _check_done(ign):
            self.failUnlessEqual(done, [immutable_plaintext])
            self.failUnlessEqual(len(self._sf.pending_requests), 5)
            self.failUnlessEqual(len(self._sf.overdue_requests), 4)
        d.addCallback(_check_done)
        return d

    def test_2_good_8_hung_then_1_recovers_immutable(self):
        d = defer.succeed(None)
        d.addCallback(lambda ign: self._set_up(False, "test_2_good_8_hung_then_1_recovers"))
        d.addCallback(lambda ign: self._hang(self.servers[2:3]))
        d.addCallback(lambda ign: self._hang(self.servers[3:]))
        d.addCallback(lambda ign: self._unhang(self.servers[2:3]))
        d.addCallback(lambda ign: self._download_and_check())
        return d

    def test_2_good_8_hung_then_1_recovers_with_2_shares_immutable(self):
        d = defer.succeed(None)
        d.addCallback(lambda ign: self._set_up(False, "test_2_good_8_hung_then_1_recovers_with_2_shares"))
        d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[0:1], self.servers[2]))
        d.addCallback(lambda ign: self._hang(self.servers[2:3]))
        d.addCallback(lambda ign: self._hang(self.servers[3:]))
        d.addCallback(lambda ign: self._unhang(self.servers[2:3]))
        d.addCallback(lambda ign: self._download_and_check())
        return d

    # The tests below do not currently pass for mutable files. The
    # mutable-file downloader does not yet handle hung servers, and the tests
    # hang forever (hence the use of SkipTest rather than .todo)

    def test_3_good_7_hung_mutable(self):
        raise unittest.SkipTest("still broken")
        d = defer.succeed(None)
        d.addCallback(lambda ign: self._set_up(True, "test_3_good_7_hung"))
        d.addCallback(lambda ign: self._hang(self.servers[3:]))
        d.addCallback(lambda ign: self._download_and_check())
        return d

    def test_2_good_8_hung_then_1_recovers_mutable(self):
        raise unittest.SkipTest("still broken")
        d = defer.succeed(None)
        d.addCallback(lambda ign: self._set_up(True, "test_2_good_8_hung_then_1_recovers"))
        d.addCallback(lambda ign: self._hang(self.servers[2:3]))
        d.addCallback(lambda ign: self._hang(self.servers[3:]))
        d.addCallback(lambda ign: self._unhang(self.servers[2:3]))
        d.addCallback(lambda ign: self._download_and_check())
        return d

    def test_2_good_8_hung_then_1_recovers_with_2_shares_mutable(self):
        raise unittest.SkipTest("still broken")
        d = defer.succeed(None)
        d.addCallback(lambda ign: self._set_up(True, "test_2_good_8_hung_then_1_recovers_with_2_shares"))
        d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[0:1], self.servers[2]))
        d.addCallback(lambda ign: self._hang(self.servers[2:3]))
        d.addCallback(lambda ign: self._hang(self.servers[3:]))
        d.addCallback(lambda ign: self._unhang(self.servers[2:3]))
        d.addCallback(lambda ign: self._download_and_check())
        return d