1 # -*- coding: utf-8 -*-
4 from twisted.trial import unittest
5 from twisted.internet import defer
6 from allmydata import uri
7 from allmydata.util.consumer import download_to_data
8 from allmydata.immutable import upload
9 from allmydata.mutable.common import UnrecoverableFileError
10 from allmydata.storage.common import storage_index_to_dir
11 from allmydata.test.no_network import GridTestMixin
12 from allmydata.test.common import ShouldFailMixin
13 from allmydata.util.pollmixin import PollMixin
14 from allmydata.interfaces import NotEnoughSharesError
# Plaintexts for the two kinds of file exercised in this module: one of them
# is uploaded by _set_up, and _check compares downloaded data against them.
16 immutable_plaintext = "data" * 10000
17 mutable_plaintext = "muta" * 10000
19 class HungServerDownloadTest(GridTestMixin, ShouldFailMixin, PollMixin,
21 # Many of these tests take around 60 seconds on François's ARM buildslave:
22 # http://tahoe-lafs.org/buildbot/builders/FranXois%20lenny-armv5tel
23 # allmydata.test.test_hung_server.HungServerDownloadTest.test_2_good_8_broken_duplicate_share_fail
24 # once ERRORed after 197 seconds on Midnight Magic's NetBSD buildslave:
25 # http://tahoe-lafs.org/buildbot/builders/MM%20netbsd4%20i386%20warp
26 # MM's buildslave varies a lot in how long it takes to run tests.
30 def _break(self, servers):
31 for (id, ss) in servers:
32 self.g.break_server(id)
34 def _hang(self, servers, **kwargs):
35 for (id, ss) in servers:
36 self.g.hang_server(id, **kwargs)
38 def _unhang(self, servers, **kwargs):
39 for (id, ss) in servers:
40 self.g.unhang_server(id, **kwargs)
42 def _hang_shares(self, shnums, **kwargs):
43 # hang all servers who are holding the given shares
44 hung_serverids = set()
45 for (i_shnum, i_serverid, i_sharefile) in self.shares:
47 if i_serverid not in hung_serverids:
48 self.g.hang_server(i_serverid, **kwargs)
49 hung_serverids.add(i_serverid)
51 def _delete_all_shares_from(self, servers):
52 serverids = [id for (id, ss) in servers]
53 for (i_shnum, i_serverid, i_sharefile) in self.shares:
54 if i_serverid in serverids:
55 os.unlink(i_sharefile)
57 def _corrupt_all_shares_in(self, servers, corruptor_func):
58 serverids = [id for (id, ss) in servers]
59 for (i_shnum, i_serverid, i_sharefile) in self.shares:
60 if i_serverid in serverids:
61 self._corrupt_share((i_shnum, i_sharefile), corruptor_func)
63 def _copy_all_shares_from(self, from_servers, to_server):
64 serverids = [id for (id, ss) in from_servers]
65 for (i_shnum, i_serverid, i_sharefile) in self.shares:
66 if i_serverid in serverids:
67 self._copy_share((i_shnum, i_sharefile), to_server)
def _copy_share(self, share, to_server):
    # Copy one share file into the storage directory of to_server, then
    # refresh self.shares and check that the new copy is listed there.
    #
    # share is a (sharenum, sharefile) pair; to_server is a
    # (serverid, server) pair like the entries of self.servers.
    (sharenum, sharefile) = share
    (_serverid, ss) = to_server
    shares_dir = os.path.join(ss.original.storedir, "shares")
    si = uri.from_string(self.uri).get_storage_index()
    si_dir = os.path.join(shares_dir, storage_index_to_dir(si))
    if not os.path.exists(si_dir):
        # The destination may not hold any share of this file yet, in
        # which case its per-storage-index directory has to be created.
        os.makedirs(si_dir)
    new_sharefile = os.path.join(si_dir, str(sharenum))
    shutil.copy(sharefile, new_sharefile)
    self.shares = self.find_uri_shares(self.uri)
    # Make sure that the storage server has the share.
    self.failUnless((sharenum, ss.original.my_nodeid, new_sharefile)
                    in self.shares)
84 def _corrupt_share(self, share, corruptor_func):
85 (sharenum, sharefile) = share
86 data = open(sharefile, "rb").read()
87 newdata = corruptor_func(data)
89 wf = open(sharefile, "wb")
def _set_up(self, mutable, testdir, num_clients=1, num_servers=10):
    # Build a grid under a per-test basedir, upload one file (mutable or
    # immutable, as requested) and record its URI and share locations in
    # self.uri and self.shares.
    #
    # Returns the upload Deferred so that callers can chain on it.
    self.mutable = mutable
    if mutable:
        self.basedir = "hung_server/mutable_" + testdir
    else:
        self.basedir = "hung_server/immutable_" + testdir

    self.set_up_grid(num_clients=num_clients, num_servers=num_servers)

    self.c0 = self.g.clients[0]
    nm = self.c0.nodemaker
    self.servers = sorted([(serverid, ss) for (serverid, ss)
                           in nm.storage_broker.get_all_servers()])
    # Rotate the sorted list so that it starts at what was index 5.
    self.servers = self.servers[5:] + self.servers[:5]

    if mutable:
        d = nm.create_mutable_file(mutable_plaintext)
        def _uploaded_mutable(node):
            self.uri = node.get_uri()
            self.shares = self.find_uri_shares(self.uri)
        d.addCallback(_uploaded_mutable)
    else:
        data = upload.Data(immutable_plaintext, convergence="")
        d = self.c0.upload(data)
        def _uploaded_immutable(upload_res):
            self.uri = upload_res.uri
            self.shares = self.find_uri_shares(self.uri)
        d.addCallback(_uploaded_immutable)
    return d
122 def _start_download(self):
123 n = self.c0.create_node_from_uri(self.uri)
125 d = n.download_best_version()
127 d = download_to_data(n)
130 def _wait_for_data(self, n):
132 d = n.download_best_version()
134 d = download_to_data(n)
def _check(self, resultingdata):
    # Compare downloaded data against the plaintext that was uploaded for
    # this kind of file. The two plaintexts differ, so exactly one of the
    # comparisons may be applied.
    if self.mutable:
        self.failUnlessEqual(resultingdata, mutable_plaintext)
    else:
        self.failUnlessEqual(resultingdata, immutable_plaintext)
143 def _download_and_check(self):
144 d = self._start_download()
145 d.addCallback(self._check)
# Run self._download_and_check through self.shouldFail and return whatever
# shouldFail returns. Two expectations are visible: UnrecoverableFileError
# with the substring "no recoverable versions", and NotEnoughSharesError.
# NOTE(review): the embedded numbering skips 149, 153 and 155, so the
# condition choosing between the two calls and the substring argument of the
# second call are not visible here. Presumably the choice follows
# self.mutable -- confirm against the complete file.
148 def _should_fail_download(self):
150 return self.shouldFail(UnrecoverableFileError, self.basedir,
151 "no recoverable versions",
152 self._download_and_check)
154 return self.shouldFail(NotEnoughSharesError, self.basedir,
156 self._download_and_check)
def test_10_good_sanity_check(self):
    # With no server tampered with, both an immutable and a mutable file
    # must download and match their plaintext.
    d = defer.succeed(None)
    for mutable in [False, True]:
        # Bind the loop variable as a default argument: a callback may run
        # after the loop has moved on, and a plain closure would then see
        # the final value of `mutable`.
        d.addCallback(lambda ign, mutable=mutable:
                      self._set_up(mutable, "test_10_good_sanity_check"))
        d.addCallback(lambda ign: self._download_and_check())
    # Return the Deferred so trial waits for the chain and sees failures.
    return d
def test_10_good_copied_share(self):
    # Copy the shares held by self.servers[2] onto self.servers[0], then
    # check that the download still succeeds, for both kinds of file.
    d = defer.succeed(None)
    for mutable in [False, True]:
        # Bind the loop variable as a default argument: a callback may run
        # after the loop has moved on, and a plain closure would then see
        # the final value of `mutable`.
        d.addCallback(lambda ign, mutable=mutable:
                      self._set_up(mutable, "test_10_good_copied_share"))
        d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[2:3], self.servers[0]))
        d.addCallback(lambda ign: self._download_and_check())
    # Return the Deferred so trial waits for the chain and sees failures.
    return d
def test_3_good_7_noshares(self):
    # Delete every share held by self.servers[3:] and check that the
    # download still succeeds, for both kinds of file.
    d = defer.succeed(None)
    for mutable in [False, True]:
        # Bind the loop variable as a default argument: a callback may run
        # after the loop has moved on, and a plain closure would then see
        # the final value of `mutable`.
        d.addCallback(lambda ign, mutable=mutable:
                      self._set_up(mutable, "test_3_good_7_noshares"))
        d.addCallback(lambda ign: self._delete_all_shares_from(self.servers[3:]))
        d.addCallback(lambda ign: self._download_and_check())
    # Return the Deferred so trial waits for the chain and sees failures.
    return d
def test_2_good_8_broken_fail(self):
    # Break self.servers[2:] and check that the download fails in the
    # expected way, for both kinds of file.
    d = defer.succeed(None)
    for mutable in [False, True]:
        # Bind the loop variable as a default argument: a callback may run
        # after the loop has moved on, and a plain closure would then see
        # the final value of `mutable`.
        d.addCallback(lambda ign, mutable=mutable:
                      self._set_up(mutable, "test_2_good_8_broken_fail"))
        d.addCallback(lambda ign: self._break(self.servers[2:]))
        d.addCallback(lambda ign: self._should_fail_download())
    # Return the Deferred so trial waits for the chain and sees failures.
    return d
def test_2_good_8_noshares_fail(self):
    # Delete every share held by self.servers[2:] and check that the
    # download fails in the expected way, for both kinds of file.
    d = defer.succeed(None)
    for mutable in [False, True]:
        # Bind the loop variable as a default argument: a callback may run
        # after the loop has moved on, and a plain closure would then see
        # the final value of `mutable`.
        d.addCallback(lambda ign, mutable=mutable:
                      self._set_up(mutable, "test_2_good_8_noshares_fail"))
        d.addCallback(lambda ign: self._delete_all_shares_from(self.servers[2:]))
        d.addCallback(lambda ign: self._should_fail_download())
    # Return the Deferred so trial waits for the chain and sees failures.
    return d
def test_2_good_8_broken_copied_share(self):
    # Copy the shares held by self.servers[2] onto self.servers[0], break
    # self.servers[2:], and check that the download still succeeds, for
    # both kinds of file.
    d = defer.succeed(None)
    for mutable in [False, True]:
        # Bind the loop variable as a default argument: a callback may run
        # after the loop has moved on, and a plain closure would then see
        # the final value of `mutable`.
        d.addCallback(lambda ign, mutable=mutable:
                      self._set_up(mutable, "test_2_good_8_broken_copied_share"))
        d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[2:3], self.servers[0]))
        d.addCallback(lambda ign: self._break(self.servers[2:]))
        d.addCallback(lambda ign: self._download_and_check())
    # Return the Deferred so trial waits for the chain and sees failures.
    return d
def test_2_good_8_broken_duplicate_share_fail(self):
    # Copy the shares held by self.servers[1] onto self.servers[0], break
    # self.servers[2:], and check that the download fails in the expected
    # way, for both kinds of file.
    d = defer.succeed(None)
    for mutable in [False, True]:
        # Bind the loop variable as a default argument: a callback may run
        # after the loop has moved on, and a plain closure would then see
        # the final value of `mutable`.
        d.addCallback(lambda ign, mutable=mutable:
                      self._set_up(mutable, "test_2_good_8_broken_duplicate_share_fail"))
        d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[1:2], self.servers[0]))
        d.addCallback(lambda ign: self._break(self.servers[2:]))
        d.addCallback(lambda ign: self._should_fail_download())
    # Return the Deferred so trial waits for the chain and sees failures.
    return d
def test_3_good_7_hung_immutable(self):
    # Hang self.servers[3:] and check that an immutable file can still be
    # downloaded.
    d = defer.succeed(None)
    d.addCallback(lambda ign: self._set_up(False, "test_3_good_7_hung"))
    d.addCallback(lambda ign: self._hang(self.servers[3:]))
    d.addCallback(lambda ign: self._download_and_check())
    # Return the Deferred so trial waits for the chain and sees failures.
    return d
# NOTE(review): the embedded numbering in this method skips 229-230, 241,
# 243, 246, 267, 271-272 and 279, so parts of it are not visible here: the end
# of the opening comment, the definitions of `done` and `_we_are_done`, the
# body of the timer loop, and whatever follows the last addCallback. The
# notes below describe only what the visible lines do.
223 def test_5_overdue_immutable(self):
224 # restrict the ShareFinder to only allow 5 outstanding requests, and
225 # arrange for the first 5 servers to hang. Then trigger the OVERDUE
226 # timers (simulating 10 seconds passed), at which point the
227 # ShareFinder should send additional queries and finish the download
228 # quickly. If we didn't have OVERDUE timers, this test would fail by
231 d = self._set_up(False, "test_5_overdue_immutable")
232 def _reduce_max_outstanding_requests_and_download(ign):
# Hang the servers holding shares 0-4, then reach into the node's private
# share finder to cap outstanding requests at 5 and raise OVERDUE_TIMEOUT
# (presumably so that the timers never expire on their own -- confirm).
233 self._hang_shares(range(5))
234 n = self.c0.create_node_from_uri(self.uri)
235 n._cnode._maybe_create_download_node()
236 self._sf = n._cnode._node._sharefinder
237 self._sf.max_outstanding_requests = 5
238 self._sf.OVERDUE_TIMEOUT = 1000.0
239 d2 = download_to_data(n)
240 # start download, but don't wait for it to complete yet
# NOTE(review): `done` and `res` are not bound in the visible lines;
# presumably a callback attached to d2 appends the downloaded data to a
# `done` list -- confirm against the complete file.
242 done.append(res) # we will poll for this later
244 d.addCallback(_reduce_max_outstanding_requests_and_download)
245 from foolscap.eventual import fireEventually, flushEventualQueue
247 d.addCallback(lambda res: fireEventually(res))
248 d.addCallback(lambda res: flushEventualQueue())
# The download must not have produced a result at this point.
249 d.addCallback(lambda ign: self.failIf(done))
250 def _check_waiting(ign):
251 # all the share requests should now be stuck waiting
252 self.failUnlessEqual(len(self._sf.pending_requests), 5)
253 # but none should be marked as OVERDUE until the timers expire
254 self.failUnlessEqual(len(self._sf.overdue_requests), 0)
255 d.addCallback(_check_waiting)
256 def _mark_overdue(ign):
257 # declare four requests overdue, allowing new requests to take
258 # their place, and leaving one stuck. The finder will keep
259 # sending requests until there are 5 non-overdue ones
260 # outstanding, at which point we'll have 4 OVERDUE, 1
261 # stuck-but-not-overdue, and 4 live requests. All 4 live requests
262 # will retire before the download is complete and the ShareFinder
263 # is shut off. That will leave 4 OVERDUE and 1
264 # stuck-but-not-overdue, for a total of 5 requests in
265 # _sf.pending_requests
# NOTE(review): the loop body is not visible. Slicing the result of
# .values() directly only works where it returns a list (Python 2, assuming
# overdue_timers is a dict -- confirm).
266 for t in self._sf.overdue_timers.values()[:4]:
268 # the timers ought to fire before the eventual-send does
269 return fireEventually()
270 d.addCallback(_mark_overdue)
# NOTE(review): `_we_are_done` is not defined in the visible lines;
# presumably it reports whether `done` has been filled in -- confirm.
273 d.addCallback(lambda ign: self.poll(_we_are_done))
274 def _check_done(ign):
275 self.failUnlessEqual(done, [immutable_plaintext])
276 self.failUnlessEqual(len(self._sf.pending_requests), 5)
277 self.failUnlessEqual(len(self._sf.overdue_requests), 4)
278 d.addCallback(_check_done)
def test_2_good_8_hung_then_1_recovers_immutable(self):
    # Hang self.servers[2:], then unhang self.servers[2] again, and check
    # that an immutable file can be downloaded.
    d = defer.succeed(None)
    d.addCallback(lambda ign: self._set_up(False, "test_2_good_8_hung_then_1_recovers"))
    d.addCallback(lambda ign: self._hang(self.servers[2:3]))
    d.addCallback(lambda ign: self._hang(self.servers[3:]))
    d.addCallback(lambda ign: self._unhang(self.servers[2:3]))
    d.addCallback(lambda ign: self._download_and_check())
    # Return the Deferred so trial waits for the chain and sees failures.
    return d
def test_2_good_8_hung_then_1_recovers_with_2_shares_immutable(self):
    # Copy the shares held by self.servers[0] onto self.servers[2], hang
    # self.servers[2:], then unhang self.servers[2] again, and check that
    # an immutable file can be downloaded.
    d = defer.succeed(None)
    d.addCallback(lambda ign: self._set_up(False, "test_2_good_8_hung_then_1_recovers_with_2_shares"))
    d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[0:1], self.servers[2]))
    d.addCallback(lambda ign: self._hang(self.servers[2:3]))
    d.addCallback(lambda ign: self._hang(self.servers[3:]))
    d.addCallback(lambda ign: self._unhang(self.servers[2:3]))
    d.addCallback(lambda ign: self._download_and_check())
    # Return the Deferred so trial waits for the chain and sees failures.
    return d
300 # The tests below do not currently pass for mutable files. The
301 # mutable-file downloader does not yet handle hung servers, and the tests
302 # hang forever (hence the use of SkipTest rather than .todo)
def test_3_good_7_hung_mutable(self):
    # Mutable counterpart of the hung-server test: hang self.servers[3:]
    # and download. Skipped unconditionally for now; the code after the
    # raise is unreachable and is kept so the test can be re-enabled by
    # deleting the raise.
    raise unittest.SkipTest("still broken")
    d = defer.succeed(None)
    d.addCallback(lambda ign: self._set_up(True, "test_3_good_7_hung"))
    d.addCallback(lambda ign: self._hang(self.servers[3:]))
    d.addCallback(lambda ign: self._download_and_check())
    # Return the Deferred so trial waits for the chain once re-enabled.
    return d
def test_2_good_8_hung_then_1_recovers_mutable(self):
    # Mutable counterpart: hang self.servers[2:], unhang self.servers[2]
    # again, and download. Skipped unconditionally for now; the code after
    # the raise is unreachable and is kept so the test can be re-enabled
    # by deleting the raise.
    raise unittest.SkipTest("still broken")
    d = defer.succeed(None)
    d.addCallback(lambda ign: self._set_up(True, "test_2_good_8_hung_then_1_recovers"))
    d.addCallback(lambda ign: self._hang(self.servers[2:3]))
    d.addCallback(lambda ign: self._hang(self.servers[3:]))
    d.addCallback(lambda ign: self._unhang(self.servers[2:3]))
    d.addCallback(lambda ign: self._download_and_check())
    # Return the Deferred so trial waits for the chain once re-enabled.
    return d
def test_2_good_8_hung_then_1_recovers_with_2_shares_mutable(self):
    # Mutable counterpart: copy the shares held by self.servers[0] onto
    # self.servers[2], hang self.servers[2:], unhang self.servers[2]
    # again, and download. Skipped unconditionally for now; the code after
    # the raise is unreachable and is kept so the test can be re-enabled
    # by deleting the raise.
    raise unittest.SkipTest("still broken")
    d = defer.succeed(None)
    d.addCallback(lambda ign: self._set_up(True, "test_2_good_8_hung_then_1_recovers_with_2_shares"))
    d.addCallback(lambda ign: self._copy_all_shares_from(self.servers[0:1], self.servers[2]))
    d.addCallback(lambda ign: self._hang(self.servers[2:3]))
    d.addCallback(lambda ign: self._hang(self.servers[3:]))
    d.addCallback(lambda ign: self._unhang(self.servers[2:3]))
    d.addCallback(lambda ign: self._download_and_check())
    # Return the Deferred so trial waits for the chain once re-enabled.
    return d