Proactively restart the SPTPS tunnel if we get receive errors.
authorEtienne Dechamps <etienne@edechamps.fr>
Sun, 17 May 2015 17:50:11 +0000 (18:50 +0100)
committerEtienne Dechamps <etienne@edechamps.fr>
Sun, 17 May 2015 18:21:50 +0000 (19:21 +0100)
There are a number of ways a SPTPS tunnel can get into a corrupt state.
For example, during key regeneration, the KEX and SIG messages from
other nodes might arrive out of order, which confuses the hell out of
the SPTPS code. Another possible scenario is not noticing another node
crashed and restarted because there was no point in time where the node
was seen completely disconnected from *all* nodes; this could result in
using the wrong (old) key. There are probably other scenarios which have
not even been considered yet. Distributed systems are hard.

When SPTPS got confused by a packet, it used to crash the entire
process; fortunately that was fixed by commit
2e7f68ad2b51648b89c4b5c61aeb4cec67c2fbbb. However, the error handling
(or lack thereof) leaves a lot to be desired. Currently, when SPTPS
encounters an error when receiving a packet, it just shrugs it off and
continues as if nothing happened. The problem is, sometimes getting
receive errors mean the tunnel is completely stuck and will not recover
on its own. In that case, the node will become unreachable - possibly
indefinitely.

The goal of this commit is to improve SPTPS error handling by taking
proactive action when an incoming packet triggers a failure, which is
often an indicator that the tunnel is stuck in some way. When that
happens, we simply restart SPTPS entirely, which should make the tunnel
recover quickly.

To prevent "storms" where two buggy nodes flood each other with invalid
packets and therefore spend all their time negotiating new tunnels, we
limit the frequency at which tunnel restarts happen to ten seconds.

It is likely this commit will solve the "Invalid KEX record length
during key regeneration" issue that has been seen in the wild. It is
difficult to be sure though because we do not have a full understanding
of all the possible conditions that can trigger this problem.

src/net_packet.c
src/protocol_key.c

index 92a3eb5..8313a54 100644 (file)
@@ -288,7 +288,14 @@ static bool receive_udppacket(node_t *n, vpn_packet_t *inpkt) {
                n->status.udppacket = false;
 
                if(!result) {
-                       logger(DEBUG_TRAFFIC, LOG_ERR, "Got bad packet from %s (%s)", n->name, n->hostname);
+                       /* Uh-oh. It might be that the tunnel is stuck in some corrupted state,
+                          so let's restart SPTPS in case that helps. But don't do that too often
+                          to prevent storms, and because that would make life a little too easy
+                          for external attackers trying to DoS us. */
+                       if(n->last_req_key < now.tv_sec - 10) {
+                               logger(DEBUG_PROTOCOL, LOG_ERR, "Failed to decode raw TCP packet from %s (%s), restarting SPTPS", n->name, n->hostname);
+                               send_req_key(n);
+                       }
                        return false;
                }
                return true;
@@ -464,11 +471,17 @@ bool receive_tcppacket_sptps(connection_t *c, const char *data, int len) {
 
        /* The packet is for us */
 
-       if(!from->status.validkey) {
-               logger(DEBUG_PROTOCOL, LOG_ERR, "Got SPTPS packet from %s (%s) but we don't have a valid key yet", from->name, from->hostname);
+       if(!sptps_receive_data(&from->sptps, data, len)) {
+               /* Uh-oh. It might be that the tunnel is stuck in some corrupted state,
+                  so let's restart SPTPS in case that helps. But don't do that too often
+                  to prevent storms. */
+               if(from->last_req_key < now.tv_sec - 10) {
+                       logger(DEBUG_PROTOCOL, LOG_ERR, "Failed to decode raw TCP packet from %s (%s), restarting SPTPS", from->name, from->hostname);
+                       send_req_key(from);
+               }
                return true;
        }
-       sptps_receive_data(&from->sptps, data, len);
+
        send_mtu_info(myself, from, MTU);
        return true;
 }
index c183ac4..83851f2 100644 (file)
@@ -107,9 +107,6 @@ bool send_req_key(node_t *to) {
                        return true;
                }
 
-               if(to->sptps.label)
-                       logger(DEBUG_ALWAYS, LOG_DEBUG, "send_req_key(%s) called while sptps->label != NULL!", to->name);
-
                char label[25 + strlen(myself->name) + strlen(to->name)];
                snprintf(label, sizeof label, "tinc UDP key expansion %s %s", myself->name, to->name);
                sptps_stop(&to->sptps);
@@ -148,11 +145,16 @@ static bool req_key_ext_h(connection_t *c, const char *request, node_t *from, no
                        try_tx(to, true);
                } else {
                        /* The packet is for us */
-                       if(!from->status.validkey) {
-                               logger(DEBUG_PROTOCOL, LOG_ERR, "Got SPTPS_PACKET from %s (%s) but we don't have a valid key yet", from->name, from->hostname);
+                       if(!sptps_receive_data(&from->sptps, buf, len)) {
+                               /* Uh-oh. It might be that the tunnel is stuck in some corrupted state,
+                                  so let's restart SPTPS in case that helps. But don't do that too often
+                                  to prevent storms. */
+                               if(from->last_req_key < now.tv_sec - 10) {
+                                       logger(DEBUG_PROTOCOL, LOG_ERR, "Failed to decode TCP packet from %s (%s), restarting SPTPS", from->name, from->hostname);
+                                       send_req_key(from);
+                               }
                                return true;
                        }
-                       sptps_receive_data(&from->sptps, buf, len);
                        send_mtu_info(myself, from, MTU);
                }
 
@@ -428,9 +430,18 @@ bool ans_key_h(connection_t *c, const char *request) {
        if(from->status.sptps) {
                char buf[strlen(key)];
                int len = b64decode(key, buf, strlen(key));
-
-               if(!len || !sptps_receive_data(&from->sptps, buf, len))
-                       logger(DEBUG_ALWAYS, LOG_ERR, "Error processing SPTPS data from %s (%s)", from->name, from->hostname);
+               if(!len || !sptps_receive_data(&from->sptps, buf, len)) {
+                       /* Uh-oh. It might be that the tunnel is stuck in some corrupted state,
+                          so let's restart SPTPS in case that helps. But don't do that too often
+                          to prevent storms.
+                          Note that simply relying on handshake timeout is not enough, because
+                          that doesn't apply to key regeneration. */
+                       if(from->last_req_key < now.tv_sec - 10) {
+                               logger(DEBUG_PROTOCOL, LOG_ERR, "Failed to decode handshake TCP packet from %s (%s), restarting SPTPS", from->name, from->hostname);
+                               send_req_key(from);
+                       }
+                       return true;
+               }
 
                if(from->status.validkey) {
                        if(*address && *port) {