diff --git a/README.md b/README.md index d5e886f333..e0daa215d6 100644 --- a/README.md +++ b/README.md @@ -1310,13 +1310,29 @@ We envision that `RunnerSet` will eventually replace `RunnerDeployment`, as `Run ### Ephemeral Runners -Both `RunnerDeployment` and `RunnerSet` has ability to configure `ephemeral: true` in the spec. +Every runner managed by ARC is "ephemeral" by default. We call it an ephemeral runner. -When it is configured, it passes a `--once` flag to every runner. +The life of an ephemeral runner managed by ARC looks like this: ARC creates a runner pod for the runner. As it's an ephemeral runner, the `--ephemeral` flag is passed to the `actions/runner` agent that runs within the `runner` container of the runner pod. -`--once` is an experimental `actions/runner` feature that instructs the runner to stop after the first job run. It has a known race condition issue that means the runner may fetch a job even when it's being terminated. If a runner fetched a job while terminating, the job is very likely to fail because the terminating runner doesn't wait for the job to complete. This is tracked in issue [#466](https://github.com/actions-runner-controller/actions-runner-controller/issues/466). +`--ephemeral` is an `actions/runner` feature that instructs the runner to stop and de-register itself after the first job run. -Since the implementation of the `--once` flag GitHub have implemented the `--ephemeral` flag which has no known race conditions and is much more supported by GitHub, this is the prefered flag for ephemeral runners. To have your `RunnerDeployment` and `RunnerSet` kinds use this new flag instead of the `--once` flag set `RUNNER_FEATURE_FLAG_EPHEMERAL` to `"true"`. For example, a `RunnerSet` configured to use the new flag looks like: +Once the ephemeral runner has completed running a workflow job, it stops with a status code of 0, hence the runner pod is marked as completed and removed by ARC. 
+ +As it's removed after a workflow job run, the runner pod is never reused across multiple GitHub Actions workflow jobs, providing you with a clean environment for each workflow job. + +Although not recommended, it's possible to disable passing the `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after a workflow job run, and `actions/runner` is known to clean only the runner's work dir after each job. That means your runner's environment, including various actions caches, docker images stored in the `dind` and layer cache, is retained across multiple workflow job runs. It may be worth trying only when you do want to prioritize job run speed over job reliability and security. + +> In the early days of the project, the flag passed to the runner when `ephemeral: true` was `--once` rather than `--ephemeral`. +> +> `--once` had a known race condition issue that meant the runner might fetch a job even when it was being terminated. If a runner fetched a job while terminating, the job was very likely to fail because the terminating runner didn't wait for the job to complete. This was tracked in issue [#466](https://github.com/actions-runner-controller/actions-runner-controller/issues/466). +> +> Later, GitHub implemented the `--ephemeral` flag, which has no known race conditions and is much better supported by GitHub, and it became the preferred flag for ephemeral runners. +> +> To leverage `--ephemeral`, ARC added an environment-variable-based feature flag to the runner image, `RUNNER_FEATURE_FLAG_EPHEMERAL`, in ARC 0.20.0. Once it was set to `"true"` via the `RunnerDeployment` or `RunnerSet` spec, the runner was configured to use `--ephemeral` instead of `--once`. +> +> Since ARC 0.22.0, `--ephemeral` is used by default. Every runner without the `RUNNER_FEATURE_FLAG_EPHEMERAL` environment variable uses `--ephemeral`. 
You can still opt out of it by setting the value to `"false"`, but we don't think there's a real need to do so. Therefore, the ability to opt out will soon be dropped, maybe in ARC 0.23.0. + +If you're upgrading from pre-0.22.0, your `RunnerDeployment` and `RunnerSet` specs might look like the following: ```yaml kind: RunnerSet @@ -1337,9 +1353,7 @@ spec: value: "true" ``` -You should configure all your ephemeral runners to use the new flag unless you have a reason for needing to use the old flag. - -Once able, `actions-runner-controller` will make `--ephemeral` the default option for `ephemeral: true` runners and potentially remove `--once` entirely. It is likely that in the future the `--once` flag will be officially deprecated by GitHub and subsquently removed in `actions/runner`. +Since 0.22.0, you can simply omit the `RUNNER_FEATURE_FLAG_EPHEMERAL` env, and your runner remains "ephemeral". ### Software Installed in the Runner Image diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 6fd816f2c1..19ee36aaee 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -50,10 +50,12 @@ const ( // This is an annotation internal to actions-runner-controller and can change in backward-incompatible ways annotationKeyRegistrationOnly = "actions-runner-controller/registration-only" - EnvVarOrg = "RUNNER_ORG" - EnvVarRepo = "RUNNER_REPO" - EnvVarEnterprise = "RUNNER_ENTERPRISE" - EnvVarEphemeral = "RUNNER_EPHEMERAL" + EnvVarOrg = "RUNNER_ORG" + EnvVarRepo = "RUNNER_REPO" + EnvVarEnterprise = "RUNNER_ENTERPRISE" + EnvVarEphemeral = "RUNNER_EPHEMERAL" + EnvVarRunnerFeatureFlagEphemeral = "RUNNER_FEATURE_FLAG_EPHEMERAL" + EnvVarTrue = "true" ) // RunnerReconciler reconciles a Runner object @@ -812,6 +814,12 @@ func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.Ru } } + // TODO Remove this once we remove RUNNER_FEATURE_FLAG_EPHEMERAL from runner's entrypoint.sh + // and make --ephemeral the default 
option. + if getRunnerEnv(pod, EnvVarRunnerFeatureFlagEphemeral) == "" { + setRunnerEnv(pod, EnvVarRunnerFeatureFlagEphemeral, EnvVarTrue) + } + return *pod, nil } diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go index 2a18b33868..d95d94a8ad 100644 --- a/controllers/runner_graceful_stop.go +++ b/controllers/runner_graceful_stop.go @@ -283,6 +283,21 @@ func getRunnerEnv(pod *corev1.Pod, key string) string { return "" } +func setRunnerEnv(pod *corev1.Pod, key, value string) { + for i := range pod.Spec.Containers { + c := pod.Spec.Containers[i] + if c.Name == containerName { + for j, env := range c.Env { + if env.Name == key { + pod.Spec.Containers[i].Env[j].Value = value + return + } + } + pod.Spec.Containers[i].Env = append(c.Env, corev1.EnvVar{Name: key, Value: value}) + } + } +} + // unregisterRunner unregisters the runner from GitHub Actions by name. // // This function returns: diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 0d58b5ffc2..7bbab05d1a 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -220,7 +220,7 @@ func initTestEnv(t *testing.T) *env { e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "") e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE") e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100) - ephemeral, _ := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL")) + ephemeral, _ := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL", "")) e.featureFlagEphemeral = ephemeral e.scaleDownDelaySecondsAfterScaleOut, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT", "10"), 10, 32) e.minReplicas, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_MIN_REPLICAS", "1"), 10, 32)