Skip to content

Commit b6c05d0

Browse files
committed
Configurable retry delay and jitter
1 parent c3b7249 commit b6c05d0

File tree

4 files changed

+177
-21
lines changed

4 files changed

+177
-21
lines changed

src/main/java/hudson/remoting/Engine.java

+47-15
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
import java.security.cert.CertificateException;
5353
import java.security.cert.X509Certificate;
5454
import java.security.interfaces.RSAPublicKey;
55+
import java.time.Duration;
5556
import java.util.ArrayList;
5657
import java.util.Arrays;
5758
import java.util.Base64;
@@ -101,6 +102,7 @@
101102
import org.jenkinsci.remoting.protocol.cert.PublicKeyMatchingX509ExtendedTrustManager;
102103
import org.jenkinsci.remoting.protocol.impl.ConnectionRefusalException;
103104
import org.jenkinsci.remoting.util.KeyUtils;
105+
import org.jenkinsci.remoting.util.RetryUtils;
104106
import org.jenkinsci.remoting.util.VersionNumber;
105107
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
106108
import org.jenkinsci.remoting.util.https.NoCheckTrustManager;
@@ -193,6 +195,12 @@ public Thread newThread(@NonNull final Runnable r) {
193195

194196
private boolean noReconnect = false;
195197

198+
private int delay = 10;
199+
200+
private double jitterFactor = 0;
201+
202+
private int jitter = 0;
203+
196204
/**
197205
* Determines whether the socket will have {@link Socket#setKeepAlive(boolean)} set or not.
198206
*
@@ -412,6 +420,21 @@ public void setNoReconnect(boolean noReconnect) {
412420
this.noReconnect = noReconnect;
413421
}
414422

423+
@Restricted(NoExternalUse.class)
424+
public void setDelay(int delay) {
425+
this.delay = delay;
426+
}
427+
428+
@Restricted(NoExternalUse.class)
429+
public void setJitterFactor(double jitterFactor) {
430+
this.jitterFactor = jitterFactor;
431+
}
432+
433+
@Restricted(NoExternalUse.class)
434+
public void setJitter(int jitter) {
435+
this.jitter = jitter;
436+
}
437+
415438
/**
416439
* Determines if JNLPAgentEndpointResolver will not perform certificate validation in the HTTPs mode.
417440
*
@@ -742,8 +765,8 @@ public void closeRead() throws IOException {
742765
}
743766
events.onDisconnect();
744767
while (true) {
745-
// TODO refactor various sleep statements into a common method
746-
TimeUnit.SECONDS.sleep(10);
768+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
769+
Thread.sleep(duration.toMillis());
747770
// Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be a 404 if the TCP port is disabled.
748771
URL ping = new URL(hudsonUrl, "login");
749772
try {
@@ -809,9 +832,9 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) {
809832
endpoint = resolver.resolve();
810833
} catch (IOException e) {
811834
if (!noReconnect) {
812-
events.status("Could not locate server among " + candidateUrls + "; waiting 10 seconds before retry", e);
813-
// TODO refactor various sleep statements into a common method
814-
TimeUnit.SECONDS.sleep(10);
835+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
836+
events.status("Could not locate server among " + candidateUrls + "; waiting " + RetryUtils.formatDuration(duration) + " seconds before retry", e);
837+
Thread.sleep(duration.toMillis());
815838
continue;
816839
} else {
817840
if (Boolean.getBoolean(Engine.class.getName() + ".nonFatalJnlpAgentEndpointResolutionExceptions")) {
@@ -929,26 +952,35 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) {
929952
}
930953

931954
private JnlpEndpointResolver createEndpointResolver(List<String> jenkinsUrls, String agentName) {
932-
JnlpEndpointResolver resolver;
933955
if (directConnection == null) {
934956
SSLSocketFactory sslSocketFactory = null;
935957
try {
936958
sslSocketFactory = getSSLSocketFactory(candidateCertificates, disableHttpsCertValidation);
937959
} catch (Exception e) {
938960
events.error(e);
939961
}
940-
resolver = new JnlpAgentEndpointResolver(jenkinsUrls, agentName, credentials, proxyCredentials, tunnel,
941-
sslSocketFactory, disableHttpsCertValidation);
962+
JnlpAgentEndpointResolver jnlpAgentEndpointResolver =
963+
new JnlpAgentEndpointResolver(
964+
jenkinsUrls,
965+
agentName,
966+
credentials,
967+
proxyCredentials,
968+
tunnel,
969+
sslSocketFactory,
970+
disableHttpsCertValidation);
971+
jnlpAgentEndpointResolver.setDelay(delay);
972+
jnlpAgentEndpointResolver.setJitterFactor(jitterFactor);
973+
jnlpAgentEndpointResolver.setJitter(jitter);
974+
return jnlpAgentEndpointResolver;
942975
} else {
943-
resolver = new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols, proxyCredentials);
976+
return new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols, proxyCredentials);
944977
}
945-
return resolver;
946978
}
947979

948980
private void onConnectionRejected(String greeting) throws InterruptedException {
949-
events.status("reconnect rejected, sleeping 10s: ", new Exception("The server rejected the connection: " + greeting));
950-
// TODO refactor various sleep statements into a common method
951-
TimeUnit.SECONDS.sleep(10);
981+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
982+
events.status("reconnect rejected, sleeping " + RetryUtils.formatDuration(duration) + "s: ", new Exception("The server rejected the connection: " + greeting));
983+
Thread.sleep(duration.toMillis());
952984
}
953985

954986
/**
@@ -970,8 +1002,8 @@ private Socket connectTcp(@NonNull JnlpAgentEndpoint endpoint) throws IOExceptio
9701002
if(retry++>10) {
9711003
throw e;
9721004
}
973-
// TODO refactor various sleep statements into a common method
974-
TimeUnit.SECONDS.sleep(10);
1005+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
1006+
Thread.sleep(duration.toMillis());
9751007
events.status(msg+" (retrying:"+retry+")",e);
9761008
}
9771009
}

src/main/java/hudson/remoting/Launcher.java

+38-4
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.jenkinsci.remoting.engine.JnlpAgentEndpointResolver;
3131
import org.jenkinsci.remoting.engine.WorkDirManager;
3232
import org.jenkinsci.remoting.util.PathUtils;
33+
import org.jenkinsci.remoting.util.RetryUtils;
3334
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
3435
import org.kohsuke.args4j.Argument;
3536
import org.kohsuke.args4j.CmdLineException;
@@ -77,6 +78,7 @@
7778
import java.security.cert.CertificateException;
7879
import java.security.cert.CertificateFactory;
7980
import java.security.cert.X509Certificate;
81+
import java.time.Duration;
8082
import java.util.ArrayList;
8183
import java.util.HashSet;
8284
import java.util.List;
@@ -85,7 +87,6 @@
8587
import java.util.Properties;
8688
import java.util.concurrent.ExecutorService;
8789
import java.util.concurrent.Executors;
88-
import java.util.concurrent.TimeUnit;
8990
import java.util.logging.Level;
9091
import java.util.logging.Logger;
9192

@@ -237,6 +238,25 @@ public void setNoCertificateCheck(boolean ignored) throws NoSuchAlgorithmExcepti
237238
@Option(name="-noReconnect",aliases="-noreconnect",usage="Doesn't try to reconnect when a communication fail, and exit instead")
238239
public boolean noReconnect = false;
239240

241+
@Option(name = "-retryDelay",
242+
usage = "The fixed delay to occur between retries. Default is 10 seconds.")
243+
public int delay = 10;
244+
245+
@Option(name = "-retryJitterFactor",
246+
usage = "The jitter factor to randomly vary retry delays by. For each retry delay, a"
247+
+ " random portion of the delay multiplied by the jitter factor will be"
248+
+ " added or subtracted to the delay. For example: a retry delay of 10"
249+
+ " seconds and a jitter factor of .25 will result in a random retry delay"
250+
+ " between 7.5 and 12.5 seconds.")
251+
public double jitterFactor = 0;
252+
253+
@Option(name = "-retryJitter",
254+
usage = "The jitter to randomly vary retry delays by. For each retry delay, a random"
255+
+ " portion of the jitter will be added or subtracted to the delay. For"
256+
+ " example: a jitter of 5 seconds will randomly add between -5 and 5"
257+
+ " seconds to each retry delay.")
258+
public int jitter = 0;
259+
240260
@Option(name = "-noKeepAlive",
241261
usage = "Disable TCP socket keep alive on connection to the controller.")
242262
public boolean noKeepAlive = false;
@@ -419,6 +439,20 @@ private synchronized void initialize() throws IOException {
419439
if (initialized) {
420440
throw new IllegalStateException("double initialization");
421441
}
442+
443+
if (jitterFactor != 0 && jitter != 0) {
444+
throw new IllegalArgumentException("Jitter factor and jitter are mutually exclusive");
445+
}
446+
if (jitterFactor < 0 || jitterFactor > 1) {
447+
throw new IllegalArgumentException("Jitter factor must be >= 0 and <= 1");
448+
}
449+
if (jitter < 0) {
450+
throw new IllegalArgumentException("Jitter must be >= 0");
451+
}
452+
if (jitter > 0 && jitter >= delay) {
453+
throw new IllegalArgumentException("Jitter must be < delay");
454+
}
455+
422456
// Create and verify working directory and logging
423457
// TODO: The pass-through for the JNLP mode has been added in JENKINS-39817. But we still need to keep this parameter in
424458
// consideration for other modes (TcpServer, TcpClient, etc.) to retain the legacy behavior.
@@ -714,9 +748,9 @@ public List<String> parseJnlpArguments() throws ParserConfigurationException, SA
714748

715749
System.err.println("Failed to obtain "+ agentJnlpURL);
716750
e.printStackTrace(System.err);
717-
System.err.println("Waiting 10 seconds before retry");
718-
// TODO refactor various sleep statements into a common method
719-
TimeUnit.SECONDS.sleep(10);
751+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
752+
System.err.println("Waiting " + RetryUtils.formatDuration(duration) + " seconds before retry");
753+
Thread.sleep(duration.toMillis());
720754
// retry
721755
} finally {
722756
if (con instanceof HttpURLConnection) {

src/main/java/org/jenkinsci/remoting/engine/JnlpAgentEndpointResolver.java

+25-2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import hudson.remoting.Engine;
3030
import hudson.remoting.Launcher;
3131
import hudson.remoting.NoProxyEvaluator;
32+
import org.jenkinsci.remoting.util.RetryUtils;
3233
import org.jenkinsci.remoting.util.VersionNumber;
3334
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
3435
import org.kohsuke.accmod.Restricted;
@@ -57,6 +58,7 @@
5758
import java.nio.charset.StandardCharsets;
5859
import java.security.interfaces.RSAPublicKey;
5960
import java.security.spec.InvalidKeySpecException;
61+
import java.time.Duration;
6062
import java.util.ArrayList;
6163
import java.util.Base64;
6264
import java.util.Iterator;
@@ -100,6 +102,12 @@ public class JnlpAgentEndpointResolver extends JnlpEndpointResolver {
100102

101103
private HostnameVerifier hostnameVerifier;
102104

105+
private int delay = 10;
106+
107+
private double jitterFactor = 0;
108+
109+
private int jitter = 0;
110+
103111
/**
104112
* If specified, only the protocols from the list will be tried during the connection.
105113
* The option provides protocol names, but the order of the check is defined internally and cannot be changed.
@@ -185,6 +193,21 @@ public void setDisableHttpsCertValidation(boolean disableHttpsCertValidation) {
185193
}
186194
}
187195

196+
@Restricted(NoExternalUse.class)
197+
public void setDelay(int delay) {
198+
this.delay = delay;
199+
}
200+
201+
@Restricted(NoExternalUse.class)
202+
public void setJitterFactor(double jitterFactor) {
203+
this.jitterFactor = jitterFactor;
204+
}
205+
206+
@Restricted(NoExternalUse.class)
207+
public void setJitter(int jitter) {
208+
this.jitter = jitter;
209+
}
210+
188211
@CheckForNull
189212
@Override
190213
public JnlpAgentEndpoint resolve() throws IOException {
@@ -402,8 +425,8 @@ public void waitForReady() throws InterruptedException {
402425
try {
403426
int retries = 0;
404427
while (true) {
405-
// TODO refactor various sleep statements into a common method
406-
Thread.sleep(1000 * 10);
428+
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
429+
Thread.sleep(duration.toMillis());
407430
// Jenkins top page might be read-protected. see http://www.nabble
408431
// .com/more-lenient-retry-logic-in-Engine.waitForServerToBack-td24703172.html
409432
if (jenkinsUrls.isEmpty()) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright (c) 2023, CloudBees, Inc.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
package org.jenkinsci.remoting.util;
26+
27+
import java.security.SecureRandom;
28+
import java.text.NumberFormat;
29+
import java.time.Duration;
30+
import java.util.Random;
31+
import org.kohsuke.accmod.Restricted;
32+
import org.kohsuke.accmod.restrictions.NoExternalUse;
33+
34+
/**
35+
* Retry-related utility methods. Used in place of a library like <a
36+
* href="https://failsafe.dev/">Failsafe</a> to minimize external third-party dependencies.
37+
*/
38+
@Restricted(NoExternalUse.class)
39+
public class RetryUtils {
40+
41+
private static final Random RANDOM = new SecureRandom();
42+
43+
// Suppress default constructor for noninstantiability
44+
private RetryUtils() {
45+
throw new AssertionError();
46+
}
47+
48+
/**
49+
* Get the retry duration based on the CLI arguments.
50+
*/
51+
public static Duration getDuration(int delay, double jitterFactor, int jitter) {
52+
if (jitterFactor != 0) {
53+
double randomFactor = 1 + (1 - RANDOM.nextDouble() * 2) * jitterFactor;
54+
return Duration.ofMillis((long) (Duration.ofSeconds(delay).toMillis() * randomFactor));
55+
} else if (jitter != 0) {
56+
double randomAddend =
57+
(1 - RANDOM.nextDouble() * 2) * Duration.ofSeconds(jitter).toMillis();
58+
return Duration.ofMillis((long) (Duration.ofSeconds(delay).toMillis() + randomAddend));
59+
} else {
60+
return Duration.ofSeconds(delay);
61+
}
62+
}
63+
64+
public static String formatDuration(Duration duration) {
65+
return NumberFormat.getNumberInstance().format(duration.toMillis() / 1000.0);
66+
}
67+
}

0 commit comments

Comments
 (0)