node.py: add BASEDIR/keepalive_timeout and BASEDIR/disconnect_timeout, to set/enable...
author Brian Warner <warner@allmydata.com>
Wed, 24 Sep 2008 17:51:12 +0000 (10:51 -0700)
committer Brian Warner <warner@allmydata.com>
Wed, 24 Sep 2008 17:51:12 +0000 (10:51 -0700)
docs/configuration.txt
src/allmydata/node.py
src/allmydata/test/common.py

index 4826615a474c156dcde4603574f91f0b92f0f295..1775b18dd36908fe74e053717e4a687f77112bd1 100644 (file)
@@ -57,6 +57,25 @@ specification, like:
 Lines that do not provide a port number will use the same client.port as the
 automatically-discovered addresses.
 
+keepalive_timeout (optional): If present, this is treated as an integral
+number of seconds (KT), and sets the Foolscap "keepalive timer" to that
+value. For each connection to another node, if nothing has been heard for a
+while, we will attempt to provoke the other end into saying something. The
+duration of silence that passes before sending the PING will be between KT
+and 2*KT. This is mainly intended to keep NAT boxes from expiring idle TCP
+sessions, but also gives TCP's long-duration keepalive/disconnect timers some
+traffic to work with. The default value is 240 (i.e. 4 minutes).
+
+disconnect_timeout (optional): If present, this is treated as an integral
+number of seconds (DT), and sets the Foolscap "disconnect timer" to that
+value. For each connection to another node, if nothing has been heard for a
+while, we will drop the connection. The duration of silence that passes
+before dropping the connection will be between DT-2*KT and 2*DT+2*KT, where
+KT is the keepalive_timeout (please see ticket #521 for more details). If we
+are sending a large amount of data to the other end (which takes more than
+DT-2*KT to deliver), we might incorrectly drop the connection. The default
+behavior (when this file does not exist) is to disable the disconnect timer.
+
 authorized_keys.SSHPORT (optional): This enables an SSH-based interactive
 Python shell, which can be used to inspect the internal state of the node,
 for debugging.  To cause the node to accept SSH connections on port 8022,
index c2aad6eda21b16575b2467de61840d77be2da7bc..2f81ab194a25a8d755740c4c451a7460fd4c66b2 100644 (file)
@@ -56,6 +56,22 @@ class Node(service.MultiService):
         self.tub = Tub(certFile=certfile)
         self.tub.setOption("logLocalFailures", True)
         self.tub.setOption("logRemoteFailures", True)
+
+        # see #521 for a discussion of how to pick these timeout values. Using
+        # 30 minutes means we'll disconnect after 22 to 68 minutes of
+        # inactivity. Receiving data will reset this timeout, however if we
+        # have more than 22min of data in the outbound queue (such as 800kB
+        # in two pipelined segments of 10 shares each) and the far end has no
+        # need to contact us, our ping might be delayed, so we may disconnect
+        # them by accident.
+        keepalive_timeout_s = self.get_config("keepalive_timeout")
+        if keepalive_timeout_s:
+            self.tub.setOption("keepaliveTimeout", int(keepalive_timeout_s))
+        disconnect_timeout_s = self.get_config("disconnect_timeout")
+        if disconnect_timeout_s:
+            # N.B.: this is in seconds, so use "1800" to get 30min
+            self.tub.setOption("disconnectTimeout", int(disconnect_timeout_s))
+
         self.nodeid = b32decode(self.tub.tubID.upper()) # binary format
         self.write_config("my_nodeid", b32encode(self.nodeid).lower() + "\n")
         self.short_nodeid = b32encode(self.nodeid).lower()[:8] # ready for printing
index 14eef8f48c8efa58730b1042810d930332ad8115..0e06b82ce29534acbfe9953e0704b0dabd60b5be 100644 (file)
@@ -347,20 +347,25 @@ class SystemTestMixin(testutil.PollMixin, testutil.StallMixin):
                 f.write(SYSTEM_TEST_CERTS[i+1])
                 f.close()
 
+            def write(name, value):
+                open(os.path.join(basedir, name), "w").write(value+"\n")
             if i == 0:
                 # client[0] runs a webserver and a helper, no key_generator
-                open(os.path.join(basedir, "webport"), "w").write("tcp:0:interface=127.0.0.1")
-                open(os.path.join(basedir, "run_helper"), "w").write("yes\n")
-                open(os.path.join(basedir, "sizelimit"), "w").write("10GB\n")
+                write("webport", "tcp:0:interface=127.0.0.1")
+                write("run_helper", "yes")
+                write("sizelimit", "10GB")
+                write("keepalive_timeout", "600")
             if i == 3:
-                # client[3] runs a webserver and uses a helper, uses key_generator
-                open(os.path.join(basedir, "webport"), "w").write("tcp:0:interface=127.0.0.1")
+                # client[3] runs a webserver and uses a helper, uses
+                # key_generator
+                write("webport", "tcp:0:interface=127.0.0.1")
+                write("disconnect_timeout", "1800")
                 if self.key_generator_furl:
                     kgf = "%s\n" % (self.key_generator_furl,)
-                    open(os.path.join(basedir, "key_generator.furl"), "w").write(kgf)
-            open(os.path.join(basedir, "introducer.furl"), "w").write(self.introducer_furl)
+                    write("key_generator.furl", kgf)
+            write("introducer.furl", self.introducer_furl)
             if self.stats_gatherer_furl:
-                open(os.path.join(basedir, "stats_gatherer.furl"), "w").write(self.stats_gatherer_furl)
+                write("stats_gatherer.furl", self.stats_gatherer_furl)
 
         # start client[0], wait for its tub to be ready (at which point it
         # will have registered the helper furl).