internal/testenv: adjust timeout calculations in CommandContext

Bryan C. Mills · gopherbot · commit 082afccebfc0 · 2022-11-01T21:32:24.000Z
I noticed some test failures in the build dashboard after CL 445597 that made me realize the grace period should be based on the test timeout, not the Context timeout: if the test itself sets a short timeout for a command, we still want to give the test process enough time to consume and log its output. I also put some more thought into how one might debug a test hang, and realized that in that case we don't want to set a WaitDelay at all: instead, we want to leave the processes in their stuck state so that they can be investigated with tools like `ps` and 'lsof'. Updates #50436. Change-Id: I65421084f44eeaaaec5dd2741cd836e9e68dd380 Reviewed-on: https://go-review.googlesource.com/c/go/+/446875 Auto-Submit: Bryan Mills <bcmills@google.com> Reviewed-by: Ian Lance Taylor <iant@google.com> Run-TryBot: Bryan Mills <bcmills@google.com> TryBot-Result: Gopher Robot <gobot@golang.org>
diff --git a/src/internal/testenv/exec.go b/src/internal/testenv/exec.go
@@ -79,63 +79,65 @@ func CleanCmdEnv(cmd *exec.Cmd) *exec.Cmd {
 //   - skips t if the platform does not support os/exec,
 //   - sends SIGQUIT (if supported by the platform) instead of SIGKILL
 //     in its Cancel function
-//   - adds a timeout (with an arbitrary grace period) before the test's deadline expires,
-//   - sets a WaitDelay for an arbitrary grace period,
+//   - if the test has a deadline, adds a Context timeout and WaitDelay
+//     for an arbitrary grace period before the test's deadline expires,
 //   - fails the test if the command does not complete before the test's deadline, and
 //   - sets a Cleanup function that verifies that the test did not leak a subprocess.
 func CommandContext(t testing.TB, ctx context.Context, name string, args ...string) *exec.Cmd {
 	t.Helper()
 	MustHaveExec(t)
 
 	var (
-		gracePeriod = 100 * time.Millisecond
-		cancel      context.CancelFunc
+		cancelCtx   context.CancelFunc
+		gracePeriod time.Duration // unlimited unless the test has a deadline (to allow for interactive debugging)
 	)
-	if s := os.Getenv("GO_TEST_TIMEOUT_SCALE"); s != "" {
-		scale, err := strconv.Atoi(s)
-		if err != nil {
-			t.Fatalf("invalid GO_TEST_TIMEOUT_SCALE: %v", err)
-		}
-		gracePeriod *= time.Duration(scale)
-	}
 
 	if t, ok := t.(interface {
 		testing.TB
 		Deadline() (time.Time, bool)
 	}); ok {
 		if td, ok := t.Deadline(); ok {
-			if cd, ok := ctx.Deadline(); !ok || cd.Sub(td) > gracePeriod {
-				// Either ctx doesn't have a deadline, or its deadline would expire
-				// after (or too close before) the test has already timed out.
-				// Compute a new timeout that will expire before the test does so that
-				// we can terminate the subprocess with a more useful signal.
-
-				timeout := time.Until(td)
-
-				// If time allows, increase the termination grace period to 5% of the
-				// remaining time.
-				if gp := timeout / 20; gp > gracePeriod {
-					gracePeriod = gp
+			// Start with a minimum grace period, just long enough to consume the
+			// output of a reasonable program after it terminates.
+			gracePeriod = 100 * time.Millisecond
+			if s := os.Getenv("GO_TEST_TIMEOUT_SCALE"); s != "" {
+				scale, err := strconv.Atoi(s)
+				if err != nil {
+					t.Fatalf("invalid GO_TEST_TIMEOUT_SCALE: %v", err)
 				}
+				gracePeriod *= time.Duration(scale)
+			}
+
+			// If time allows, increase the termination grace period to 5% of the
+			// test's remaining time.
+			testTimeout := time.Until(td)
+			if gp := testTimeout / 20; gp > gracePeriod {
+				gracePeriod = gp
+			}
 
-				// When we run commands that execute subprocesses, we want to reserve two
-				// grace periods to clean up. We will send the first termination signal when
-				// the context expires, then wait one grace period for the process to
-				// produce whatever useful output it can (such as a stack trace). After the
-				// first grace period expires, we'll escalate to os.Kill, leaving the second
-				// grace period for the test function to record its output before the test
-				// process itself terminates.
-				timeout -= 2 * gracePeriod
-
-				ctx, cancel = context.WithTimeout(ctx, timeout)
-				t.Cleanup(cancel)
+			// When we run commands that execute subprocesses, we want to reserve two
+			// grace periods to clean up: one for the delay between the first
+			// termination signal being sent (via the Cancel callback when the Context
+			// expires) and the process being forcibly terminated (via the WaitDelay
+			// field), and a second one for the delay becween the process being
+			// terminated and and the test logging its output for debugging.
+			//
+			// (We want to ensure that the test process itself has enough time to
+			// log the output before it is also terminated.)
+			cmdTimeout := testTimeout - 2*gracePeriod
+
+			if cd, ok := ctx.Deadline(); !ok || time.Until(cd) > cmdTimeout {
+				// Either ctx doesn't have a deadline, or its deadline would expire
+				// after (or too close before) the test has already timed out.
+				// Add a shorter timeout so that the test will produce useful output.
+				ctx, cancelCtx = context.WithTimeout(ctx, cmdTimeout)
 			}
 		}
 	}
 
 	cmd := exec.CommandContext(ctx, name, args...)
 	cmd.Cancel = func() error {
-		if cancel != nil && ctx.Err() == context.DeadlineExceeded {
+		if cancelCtx != nil && ctx.Err() == context.DeadlineExceeded {
 			// The command timed out due to running too close to the test's deadline.
 			// There is no way the test did that intentionally — it's too close to the
 			// wire! — so mark it as a test failure. That way, if the test expects the
@@ -154,8 +156,8 @@ func CommandContext(t testing.TB, ctx context.Context, name string, args ...stri
 	cmd.WaitDelay = gracePeriod
 
 	t.Cleanup(func() {
-		if cancel != nil {
-			cancel()
+		if cancelCtx != nil {
+			cancelCtx()
 		}
 		if cmd.Process != nil && cmd.ProcessState == nil {
 			t.Errorf("command was started, but test did not wait for it to complete: %v", cmd)