From 23a782bb92d5b62990ee1b0fc3f5b47a94d609ce Mon Sep 17 00:00:00 2001 From: Jana Radhakrishnan Date: Tue, 27 Sep 2016 00:12:07 -0700 Subject: [PATCH] Avoid returning early on agent join failures When a gossip join fails, do not return early in the call chain: a join failure is most likely transient, and the retry logic built into networkdb will retry the join and is expected to succeed. Returning early prevents the ingress network/sandbox from being initialized, which causes a problem even after the gossip join succeeds on retry. Signed-off-by: Jana Radhakrishnan --- agent.go | 3 +-- networkdb/cluster.go | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/agent.go b/agent.go index fb0c342257..4c8980b2e1 100644 --- a/agent.go +++ b/agent.go @@ -191,8 +191,7 @@ func (c *controller) agentSetup() error { if remoteAddr != "" { if err := c.agentJoin(remoteAddr); err != nil { - logrus.Errorf("Error in agentJoin : %v", err) - return nil + logrus.Errorf("Error in joining gossip cluster : %v(join will be retried in background)", err) } } diff --git a/networkdb/cluster.go b/networkdb/cluster.go index 3b624c9a27..c3bfdd4051 100644 --- a/networkdb/cluster.go +++ b/networkdb/cluster.go @@ -161,6 +161,10 @@ func (nDB *NetworkDB) retryJoin(members []string, stop <-chan struct{}) { logrus.Errorf("Failed to join memberlist %s on retry: %v", members, err) continue } + if err := nDB.sendNodeEvent(NodeEventTypeJoin); err != nil { + logrus.Errorf("failed to send node join on retry: %v", err) + continue + } return case <-stop: return