From: Brian Warner Date: Wed, 24 Sep 2008 17:51:12 +0000 (-0700) Subject: node.py: add BASEDIR/keepalive_timeout and BASEDIR/disconnect_timeout, to set/enable... X-Git-Url: https://git.rkrishnan.org/vdrive/%22file:/frontends/reliability?a=commitdiff_plain;h=dd9171eb729d5dc28d8f83ee8340fc3f41f3779a;p=tahoe-lafs%2Ftahoe-lafs.git node.py: add BASEDIR/keepalive_timeout and BASEDIR/disconnect_timeout, to set/enable the foolscap timers, for #521 --- diff --git a/docs/configuration.txt b/docs/configuration.txt index 4826615a..1775b18d 100644 --- a/docs/configuration.txt +++ b/docs/configuration.txt @@ -57,6 +57,25 @@ specification, like: Lines that do not provide a port number will use the same client.port as the automatically-discovered addresses. +keepalive_timeout (optional): If present, this is treated as an integral +number of seconds, and sets the Foolscap "keepalive timer" to that value. For +each connection to another node, if nothing has been heard for a while, we +will attempt to provoke the other end into saying something. The duration of +silence that passes before sending the PING will be between KT and 2*KT. This +is mainly intended to keep NAT boxes from expiring idle TCP sessions, but +also gives TCP's long-duration keepalive/disconnect timers some traffic to +work with. The default value is 240 (i.e. 4 minutes). + +disconnect_timeout (optional): If present, this is treated as an integral +number of seconds, and sets the Foolscap "disconnect timer" to that value. +For each connection to another node, if nothing has been heard for a while, +we will drop the connection. The duration of silence that passes before +dropping the connection will be between DT-2*KT and 2*DT+2*KT (please see +ticket #521 for more details). If we are sending a large amount of data to +the other end (which takes more than DT-2*KT to deliver), we might +incorrectly drop the connection. The default behavior (when this file does +not exist) is to disable the disconnect timer. + authorized_keys.SSHPORT (optional): This enables an SSH-based interactive Python shell, which can be used to inspect the internal state of the node, for debugging. To cause the node to accept SSH connections on port 8022, diff --git a/src/allmydata/node.py b/src/allmydata/node.py index c2aad6ed..2f81ab19 100644 --- a/src/allmydata/node.py +++ b/src/allmydata/node.py @@ -56,6 +56,22 @@ class Node(service.MultiService): self.tub = Tub(certFile=certfile) self.tub.setOption("logLocalFailures", True) self.tub.setOption("logRemoteFailures", True) + + # see #521 for a discussion of how to pick these timeout values. Using + # 30 minutes means we'll disconnect after 22 to 68 minutes of + # inactivity. Receiving data will reset this timeout, however if we + # have more than 22min of data in the outbound queue (such as 800kB + # in two pipelined segments of 10 shares each) and the far end has no + # need to contact us, our ping might be delayed, so we may disconnect + # them by accident. + keepalive_timeout_s = self.get_config("keepalive_timeout") + if keepalive_timeout_s: + self.tub.setOption("keepaliveTimeout", int(keepalive_timeout_s)) + disconnect_timeout_s = self.get_config("disconnect_timeout") + if disconnect_timeout_s: + # N.B.: this is in seconds, so use "1800" to get 30min + self.tub.setOption("disconnectTimeout", int(disconnect_timeout_s)) + self.nodeid = b32decode(self.tub.tubID.upper()) # binary format self.write_config("my_nodeid", b32encode(self.nodeid).lower() + "\n") self.short_nodeid = b32encode(self.nodeid).lower()[:8] # ready for printing diff --git a/src/allmydata/test/common.py b/src/allmydata/test/common.py index 14eef8f4..0e06b82c 100644 --- a/src/allmydata/test/common.py +++ b/src/allmydata/test/common.py @@ -347,20 +347,25 @@ class SystemTestMixin(testutil.PollMixin, testutil.StallMixin): f.write(SYSTEM_TEST_CERTS[i+1]) f.close() + def write(name, value): + open(os.path.join(basedir, name), "w").write(value+"\n") if i == 0: # client[0] runs a webserver and a helper, no key_generator - open(os.path.join(basedir, "webport"), "w").write("tcp:0:interface=127.0.0.1") - open(os.path.join(basedir, "run_helper"), "w").write("yes\n") - open(os.path.join(basedir, "sizelimit"), "w").write("10GB\n") + write("webport", "tcp:0:interface=127.0.0.1") + write("run_helper", "yes") + write("sizelimit", "10GB") + write("keepalive_timeout", "600") if i == 3: - # client[3] runs a webserver and uses a helper, uses key_generator - open(os.path.join(basedir, "webport"), "w").write("tcp:0:interface=127.0.0.1") + # client[3] runs a webserver and uses a helper, uses + # key_generator + write("webport", "tcp:0:interface=127.0.0.1") + write("disconnect_timeout", "1800") if self.key_generator_furl: kgf = "%s\n" % (self.key_generator_furl,) - open(os.path.join(basedir, "key_generator.furl"), "w").write(kgf) - open(os.path.join(basedir, "introducer.furl"), "w").write(self.introducer_furl) + write("key_generator.furl", kgf) + write("introducer.furl", self.introducer_furl) if self.stats_gatherer_furl: - open(os.path.join(basedir, "stats_gatherer.furl"), "w").write(self.stats_gatherer_furl) + write("stats_gatherer.furl", self.stats_gatherer_furl) # start client[0], wait for it's tub to be ready (at which point it # will have registered the helper furl).