From dd9171eb729d5dc28d8f83ee8340fc3f41f3779a Mon Sep 17 00:00:00 2001
From: Brian Warner <warner@allmydata.com>
Date: Wed, 24 Sep 2008 10:51:12 -0700
Subject: [PATCH] node.py: add BASEDIR/keepalive_timeout and
 BASEDIR/disconnect_timeout, to set/enable the foolscap timers, for #521

---
 docs/configuration.txt       | 19 +++++++++++++++++++
 src/allmydata/node.py        | 16 ++++++++++++++++
 src/allmydata/test/common.py | 21 +++++++++++++--------
 3 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/docs/configuration.txt b/docs/configuration.txt
index 4826615a..1775b18d 100644
--- a/docs/configuration.txt
+++ b/docs/configuration.txt
@@ -57,6 +57,25 @@ specification, like:
 Lines that do not provide a port number will use the same client.port as the
 automatically-discovered addresses.
 
+keepalive_timeout (optional): If present, this is treated as an integral
+number of seconds (KT), and sets the Foolscap "keepalive timer" to that
+value. For each connection to another node, if nothing has been heard for a
+while, we will send a PING to provoke the other end into saying something.
+The silence that passes before the PING is sent will be between KT and 2*KT.
+This is mainly intended to keep NAT boxes from expiring idle TCP sessions,
+but also gives TCP's long-duration keepalive/disconnect timers some traffic
+to work with. The default value is 240 (i.e. 4 minutes).
+
+disconnect_timeout (optional): If present, this is treated as an integral
+number of seconds (DT), and sets the Foolscap "disconnect timer" to that
+value. For each connection to another node, if nothing has been heard for a
+while, we will drop the connection. The silence that passes before dropping
+the connection will be between DT-2*KT and 2*DT+2*KT, where KT is the
+keepalive timeout (see ticket #521 for details). If we are sending a large
+amount of data to the other end (taking more than DT-2*KT to deliver), we
+might incorrectly drop the connection. The default behavior (when this file
+does not exist) is to disable the disconnect timer.
+
 authorized_keys.SSHPORT (optional): This enables an SSH-based interactive
 Python shell, which can be used to inspect the internal state of the node,
 for debugging.  To cause the node to accept SSH connections on port 8022,
diff --git a/src/allmydata/node.py b/src/allmydata/node.py
index c2aad6ed..2f81ab19 100644
--- a/src/allmydata/node.py
+++ b/src/allmydata/node.py
@@ -56,6 +56,22 @@ class Node(service.MultiService):
         self.tub = Tub(certFile=certfile)
         self.tub.setOption("logLocalFailures", True)
         self.tub.setOption("logRemoteFailures", True)
+
+        # See #521 for how to pick these timeout values. With the default
+        # 4-minute keepalive, a 30-minute disconnect timeout means we'll
+        # disconnect after 22 to 68 minutes of inactivity. Receiving data
+        # resets this timer; however, if we have more than 22min of data in
+        # the outbound queue (such as 800kB in two pipelined segments of 10
+        # shares each) and the far end has no need to contact us, our ping
+        # might be delayed, so we may disconnect them by accident.
+        keepalive_timeout_s = self.get_config("keepalive_timeout")
+        if keepalive_timeout_s:
+            self.tub.setOption("keepaliveTimeout", int(keepalive_timeout_s))
+        disconnect_timeout_s = self.get_config("disconnect_timeout")
+        if disconnect_timeout_s:
+            # N.B.: this is in seconds, so use "1800" to get 30min
+            self.tub.setOption("disconnectTimeout", int(disconnect_timeout_s))
+
         self.nodeid = b32decode(self.tub.tubID.upper()) # binary format
         self.write_config("my_nodeid", b32encode(self.nodeid).lower() + "\n")
         self.short_nodeid = b32encode(self.nodeid).lower()[:8] # ready for printing
diff --git a/src/allmydata/test/common.py b/src/allmydata/test/common.py
index 14eef8f4..0e06b82c 100644
--- a/src/allmydata/test/common.py
+++ b/src/allmydata/test/common.py
@@ -347,20 +347,25 @@ class SystemTestMixin(testutil.PollMixin, testutil.StallMixin):
                 f.write(SYSTEM_TEST_CERTS[i+1])
                 f.close()
 
+            def write(name, value):  # write value + newline to basedir/name
+                open(os.path.join(basedir, name), "w").write(value+"\n")
             if i == 0:
                 # client[0] runs a webserver and a helper, no key_generator
-                open(os.path.join(basedir, "webport"), "w").write("tcp:0:interface=127.0.0.1")
-                open(os.path.join(basedir, "run_helper"), "w").write("yes\n")
-                open(os.path.join(basedir, "sizelimit"), "w").write("10GB\n")
+                write("webport", "tcp:0:interface=127.0.0.1")
+                write("run_helper", "yes")
+                write("sizelimit", "10GB")
+                write("keepalive_timeout", "600")
             if i == 3:
-                # client[3] runs a webserver and uses a helper, uses key_generator
-                open(os.path.join(basedir, "webport"), "w").write("tcp:0:interface=127.0.0.1")
+                # client[3] runs a webserver and uses a helper, uses
+                # key_generator
+                write("webport", "tcp:0:interface=127.0.0.1")
+                write("disconnect_timeout", "1800")
                 if self.key_generator_furl:
                     kgf = "%s\n" % (self.key_generator_furl,)
-                    open(os.path.join(basedir, "key_generator.furl"), "w").write(kgf)
-            open(os.path.join(basedir, "introducer.furl"), "w").write(self.introducer_furl)
+                    write("key_generator.furl", kgf)
+            write("introducer.furl", self.introducer_furl)
             if self.stats_gatherer_furl:
-                open(os.path.join(basedir, "stats_gatherer.furl"), "w").write(self.stats_gatherer_furl)
+                write("stats_gatherer.furl", self.stats_gatherer_furl)
 
         # start client[0], wait for it's tub to be ready (at which point it
         # will have registered the helper furl).
-- 
2.45.2