Skip to content

Commit c03860b

Browse files
committed
Configurable retry delay and jitter
1 parent d76b9dd commit c03860b

File tree

5 files changed

+224
-21
lines changed

5 files changed

+224
-21
lines changed

src/main/java/hudson/remoting/Engine.java

+48-15
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import java.security.cert.CertificateException;
5151
import java.security.cert.X509Certificate;
5252
import java.security.interfaces.RSAPublicKey;
53+
import java.time.Duration;
5354
import java.util.ArrayList;
5455
import java.util.Arrays;
5556
import java.util.HashMap;
@@ -93,7 +94,10 @@
9394
import org.jenkinsci.remoting.protocol.cert.PublicKeyMatchingX509ExtendedTrustManager;
9495
import org.jenkinsci.remoting.protocol.impl.ConnectionRefusalException;
9596
import org.jenkinsci.remoting.util.KeyUtils;
97+
import org.jenkinsci.remoting.util.RetryUtils;
9698
import org.jenkinsci.remoting.util.VersionNumber;
99+
import org.kohsuke.accmod.Restricted;
100+
import org.kohsuke.accmod.restrictions.NoExternalUse;
97101

98102
/**
99103
* Agent engine that proactively connects to Jenkins controller.
@@ -178,6 +182,12 @@ public Thread newThread(@NonNull final Runnable r) {
178182

179183
private boolean noReconnect = false;
180184

185+
private int delay = 10;
186+
187+
private double jitterFactor = 0;
188+
189+
private int jitter = 0;
190+
181191
/**
182192
* Determines whether the socket will have {@link Socket#setKeepAlive(boolean)} set or not.
183193
*
@@ -397,6 +407,21 @@ public void setNoReconnect(boolean noReconnect) {
397407
this.noReconnect = noReconnect;
398408
}
399409

410+
@Restricted(NoExternalUse.class)
411+
public void setDelay(int delay) {
412+
this.delay = delay;
413+
}
414+
415+
@Restricted(NoExternalUse.class)
416+
public void setJitterFactor(double jitterFactor) {
417+
this.jitterFactor = jitterFactor;
418+
}
419+
420+
@Restricted(NoExternalUse.class)
421+
public void setJitter(int jitter) {
422+
this.jitter = jitter;
423+
}
424+
400425
/**
401426
* Determines if JNLPAgentEndpointResolver will not perform certificate validation in the HTTPs mode.
402427
*
@@ -694,8 +719,8 @@ public void closeRead() throws IOException {
694719
}
695720
events.onDisconnect();
696721
while (true) {
697-
// TODO refactor various sleep statements into a common method
698-
TimeUnit.SECONDS.sleep(10);
722+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
723+
Thread.sleep(duration.toMillis());
699724
// Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be a 404 if the TCP port is disabled.
700725
URL ping = new URL(hudsonUrl, "login");
701726
try {
@@ -761,9 +786,9 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) {
761786
endpoint = resolver.resolve();
762787
} catch (IOException e) {
763788
if (!noReconnect) {
764-
events.status("Could not locate server among " + candidateUrls + "; waiting 10 seconds before retry", e);
765-
// TODO refactor various sleep statements into a common method
766-
TimeUnit.SECONDS.sleep(10);
789+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
790+
events.status("Could not locate server among " + candidateUrls + "; waiting " + RetryUtils.formatDuration(duration) + " seconds before retry", e);
791+
Thread.sleep(duration.toMillis());
767792
continue;
768793
} else {
769794
if (Boolean.getBoolean(Engine.class.getName() + ".nonFatalJnlpAgentEndpointResolutionExceptions")) {
@@ -881,26 +906,34 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) {
881906
}
882907

883908
private JnlpEndpointResolver createEndpointResolver(List<String> jenkinsUrls) {
884-
JnlpEndpointResolver resolver;
885909
if (directConnection == null) {
886910
SSLSocketFactory sslSocketFactory = null;
887911
try {
888912
sslSocketFactory = getSSLSocketFactory();
889913
} catch (Exception e) {
890914
events.error(e);
891915
}
892-
resolver = new JnlpAgentEndpointResolver(jenkinsUrls, credentials, proxyCredentials, tunnel,
893-
sslSocketFactory, disableHttpsCertValidation);
916+
JnlpAgentEndpointResolver jnlpAgentEndpointResolver =
917+
new JnlpAgentEndpointResolver(
918+
jenkinsUrls,
919+
credentials,
920+
proxyCredentials,
921+
tunnel,
922+
sslSocketFactory,
923+
disableHttpsCertValidation);
924+
jnlpAgentEndpointResolver.setDelay(delay);
925+
jnlpAgentEndpointResolver.setJitterFactor(jitterFactor);
926+
jnlpAgentEndpointResolver.setJitter(jitter);
927+
return jnlpAgentEndpointResolver;
894928
} else {
895-
resolver = new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols);
929+
return new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols);
896930
}
897-
return resolver;
898931
}
899932

900933
private void onConnectionRejected(String greeting) throws InterruptedException {
901-
events.status("reconnect rejected, sleeping 10s: ", new Exception("The server rejected the connection: " + greeting));
902-
// TODO refactor various sleep statements into a common method
903-
TimeUnit.SECONDS.sleep(10);
934+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
935+
events.status("reconnect rejected, sleeping " + RetryUtils.formatDuration(duration) + "s: ", new Exception("The server rejected the connection: " + greeting));
936+
Thread.sleep(duration.toMillis());
904937
}
905938

906939
/**
@@ -922,8 +955,8 @@ private Socket connectTcp(@NonNull JnlpAgentEndpoint endpoint) throws IOExceptio
922955
if(retry++>10) {
923956
throw e;
924957
}
925-
// TODO refactor various sleep statements into a common method
926-
TimeUnit.SECONDS.sleep(10);
958+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
959+
Thread.sleep(duration.toMillis());
927960
events.status(msg+" (retrying:"+retry+")",e);
928961
}
929962
}

src/main/java/hudson/remoting/Launcher.java

+49-4
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import hudson.remoting.Channel.Mode;
3030
import org.jenkinsci.remoting.engine.WorkDirManager;
3131
import org.jenkinsci.remoting.util.PathUtils;
32+
import org.jenkinsci.remoting.util.RetryUtils;
3233
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
3334
import org.jenkinsci.remoting.util.https.NoCheckTrustManager;
3435
import org.kohsuke.args4j.CmdLineException;
@@ -84,14 +85,14 @@
8485
import java.security.SecureRandom;
8586
import java.security.cert.CertificateException;
8687
import java.security.cert.CertificateFactory;
88+
import java.time.Duration;
8789
import java.util.ArrayList;
8890
import java.util.Base64;
8991
import java.util.List;
9092
import java.util.Locale;
9193
import java.util.Properties;
9294
import java.util.concurrent.ExecutorService;
9395
import java.util.concurrent.Executors;
94-
import java.util.concurrent.TimeUnit;
9596
import java.util.logging.Level;
9697
import java.util.logging.Logger;
9798

@@ -228,6 +229,25 @@ public void setNoCertificateCheck(boolean ignored) throws NoSuchAlgorithmExcepti
228229
@Option(name="-noReconnect",usage="Doesn't try to reconnect when a communication fail, and exit instead")
229230
public boolean noReconnect = false;
230231

232+
@Option(name = "-retryDelay",
233+
usage = "The fixed delay to occur between retries. Default is 10 seconds.")
234+
public int delay = 10;
235+
236+
@Option(name = "-retryJitterFactor",
237+
usage = "The jitter factor to randomly vary retry delays by. For each retry delay, a"
238+
+ " random portion of the delay multiplied by the jitter factor will be"
239+
+ " added or subtracted to the delay. For example: a retry delay of 10"
240+
+ " seconds and a jitter factor of .25 will result in a random retry delay"
241+
+ " between 7.5 and 12.5 seconds.")
242+
public double jitterFactor = 0;
243+
244+
@Option(name = "-retryJitter",
245+
usage = "The jitter to randomly vary retry delays by. For each retry delay, a random"
246+
+ " portion of the jitter will be added or subtracted to the delay. For"
247+
+ " example: a jitter of 5 seconds will randomly add between -5 and 5"
248+
+ " seconds to each retry delay.")
249+
public int jitter = 0;
250+
231251
@Option(name = "-noKeepAlive",
232252
usage = "Disable TCP socket keep alive on connection to the controller.")
233253
public boolean noKeepAlive = false;
@@ -313,6 +333,19 @@ public void run() throws Exception {
313333
return;
314334
}
315335

336+
if (jitterFactor != 0 && jitter != 0) {
337+
throw new CmdLineException("Jitter factor and jitter are mutually exclusive");
338+
}
339+
if (jitterFactor < 0 || jitterFactor > 1) {
340+
throw new CmdLineException("Jitter factor must be >= 0 and <= 1");
341+
}
342+
if (jitter < 0) {
343+
throw new CmdLineException("Jitter must be >= 0");
344+
}
345+
if (jitter > 0 && jitter >= delay) {
346+
throw new CmdLineException("Jitter must be < delay");
347+
}
348+
316349
// Create and verify working directory and logging
317350
// TODO: The pass-through for the JNLP mode has been added in JENKINS-39817. But we still need to keep this parameter in
318351
// consideration for other modes (TcpServer, TcpClient, etc.) to retain the legacy behavior.
@@ -351,6 +384,18 @@ public void run() throws Exception {
351384
if (this.noReconnect) {
352385
jnlpArgs.add("-noreconnect");
353386
}
387+
if (this.delay != 10) {
388+
jnlpArgs.add("-retryDelay");
389+
jnlpArgs.add(Integer.toString(this.delay));
390+
}
391+
if (this.jitterFactor != 0) {
392+
jnlpArgs.add("-retryJitterFactor");
393+
jnlpArgs.add(Double.toString(this.jitterFactor));
394+
}
395+
if (this.jitter != 0) {
396+
jnlpArgs.add("-retryJitter");
397+
jnlpArgs.add(Integer.toString(this.jitter));
398+
}
354399
if (this.noKeepAlive) {
355400
jnlpArgs.add("-noKeepAlive");
356401
}
@@ -568,9 +613,9 @@ public List<String> parseJnlpArguments() throws ParserConfigurationException, SA
568613

569614
System.err.println("Failed to obtain "+ agentJnlpURL);
570615
e.printStackTrace(System.err);
571-
System.err.println("Waiting 10 seconds before retry");
572-
// TODO refactor various sleep statements into a common method
573-
TimeUnit.SECONDS.sleep(10);
616+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
617+
System.err.println("Waiting " + RetryUtils.formatDuration(duration) + " seconds before retry");
618+
Thread.sleep(duration.toMillis());
574619
// retry
575620
} finally {
576621
if (con instanceof HttpURLConnection) {

src/main/java/hudson/remoting/jnlp/Main.java

+34
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,25 @@ public class Main {
104104
usage="If the connection ends, don't retry and just exit.")
105105
public boolean noReconnect = false;
106106

107+
@Option(name = "-retryDelay",
108+
usage = "The fixed delay to occur between retries. Default is 10 seconds.")
109+
public int delay = 10;
110+
111+
@Option(name = "-retryJitterFactor",
112+
usage = "The jitter factor to randomly vary retry delays by. For each retry delay, a"
113+
+ " random portion of the delay multiplied by the jitter factor will be"
114+
+ " added or subtracted to the delay. For example: a retry delay of 10"
115+
+ " seconds and a jitter factor of .25 will result in a random retry delay"
116+
+ " between 7.5 and 12.5 seconds.")
117+
public double jitterFactor = 0;
118+
119+
@Option(name = "-retryJitter",
120+
usage = "The jitter to randomly vary retry delays by. For each retry delay, a random"
121+
+ " portion of the jitter will be added or subtracted to the delay. For"
122+
+ " example: a jitter of 5 seconds will randomly add between -5 and 5"
123+
+ " seconds to each retry delay.")
124+
public int jitter = 0;
125+
107126
@Option(name="-noKeepAlive",
108127
usage="Disable TCP socket keep alive on connection to the controller.")
109128
public boolean noKeepAlive = false;
@@ -268,6 +287,18 @@ public static void _main(String[] args) throws IOException, InterruptedException
268287
throw new CmdLineException(p, "-webSocket supports only a single -url", null);
269288
}
270289
}
290+
if (m.jitterFactor != 0 && m.jitter != 0) {
291+
throw new CmdLineException("Jitter factor and jitter are mutually exclusive");
292+
}
293+
if (m.jitterFactor < 0 || m.jitterFactor > 1) {
294+
throw new CmdLineException("Jitter factor must be >= 0 and <= 1");
295+
}
296+
if (m.jitter < 0) {
297+
throw new CmdLineException("Jitter must be >= 0");
298+
}
299+
if (m.jitter > 0 && m.jitter >= m.delay) {
300+
throw new CmdLineException("Jitter must be < delay");
301+
}
271302
m.main();
272303
}
273304

@@ -304,6 +335,9 @@ public Engine createEngine() {
304335
if(jarCache!=null)
305336
engine.setJarCache(new FileSystemJarCache(jarCache,true));
306337
engine.setNoReconnect(noReconnect);
338+
engine.setDelay(delay);
339+
engine.setJitterFactor(jitterFactor);
340+
engine.setJitter(jitter);
307341
engine.setKeepAlive(!noKeepAlive);
308342

309343
if (disableHttpsCertValidation) {

src/main/java/org/jenkinsci/remoting/engine/JnlpAgentEndpointResolver.java

+27-2
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,12 @@
2929
import hudson.remoting.Engine;
3030
import hudson.remoting.Launcher;
3131
import hudson.remoting.NoProxyEvaluator;
32+
import org.jenkinsci.remoting.util.RetryUtils;
3233
import org.jenkinsci.remoting.util.VersionNumber;
3334
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
3435
import org.jenkinsci.remoting.util.https.NoCheckTrustManager;
36+
import org.kohsuke.accmod.Restricted;
37+
import org.kohsuke.accmod.restrictions.NoExternalUse;
3538

3639
import javax.net.ssl.HttpsURLConnection;
3740
import javax.net.ssl.SSLContext;
@@ -58,6 +61,7 @@
5861
import java.security.SecureRandom;
5962
import java.security.interfaces.RSAPublicKey;
6063
import java.security.spec.InvalidKeySpecException;
64+
import java.time.Duration;
6165
import java.util.ArrayList;
6266
import java.util.Arrays;
6367
import java.util.Base64;
@@ -98,6 +102,12 @@ public class JnlpAgentEndpointResolver extends JnlpEndpointResolver {
98102

99103
private boolean disableHttpsCertValidation;
100104

105+
private int delay = 10;
106+
107+
private double jitterFactor = 0;
108+
109+
private int jitter = 0;
110+
101111
/**
102112
* If specified, only the protocols from the list will be tried during the connection.
103113
* The option provides protocol names, but the order of the check is defined internally and cannot be changed.
@@ -185,6 +195,21 @@ public void setDisableHttpsCertValidation(boolean disableHttpsCertValidation) {
185195
this.disableHttpsCertValidation = disableHttpsCertValidation;
186196
}
187197

198+
@Restricted(NoExternalUse.class)
199+
public void setDelay(int delay) {
200+
this.delay = delay;
201+
}
202+
203+
@Restricted(NoExternalUse.class)
204+
public void setJitterFactor(double jitterFactor) {
205+
this.jitterFactor = jitterFactor;
206+
}
207+
208+
@Restricted(NoExternalUse.class)
209+
public void setJitter(int jitter) {
210+
this.jitter = jitter;
211+
}
212+
188213
@CheckForNull
189214
@Override
190215
public JnlpAgentEndpoint resolve() throws IOException {
@@ -380,8 +405,8 @@ public void waitForReady() throws InterruptedException {
380405
try {
381406
int retries = 0;
382407
while (true) {
383-
// TODO refactor various sleep statements into a common method
384-
Thread.sleep(1000 * 10);
408+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
409+
Thread.sleep(duration.toMillis());
385410
try {
386411
// Jenkins top page might be read-protected. see http://www.nabble
387412
// .com/more-lenient-retry-logic-in-Engine.waitForServerToBack-td24703172.html

0 commit comments

Comments
 (0)