Skip to content

Commit

Permalink
Use ProgressDeadline as failure timeout for startup probe
Browse files Browse the repository at this point in the history
- Also just drop exec probe entirely for periodSeconds > 1 since these
  can just use the readiness probe now. (Easier than figuring out how to
  do square ProgressDeadline with a custom period).
  • Loading branch information
julz committed Feb 26, 2021
1 parent 1c03c42 commit 264d2e9
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 105 deletions.
5 changes: 0 additions & 5 deletions cmd/queue/execprobe.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ import (
network "knative.dev/networking/pkg"
"knative.dev/serving/pkg/queue"
"knative.dev/serving/pkg/queue/health"
"knative.dev/serving/pkg/queue/readiness"
)

const (
Expand Down Expand Up @@ -76,10 +75,6 @@ func standaloneProbeMain(timeout time.Duration, transport http.RoundTripper) (ex
return 1
}

if timeout == 0 {
timeout = readiness.PollTimeout
}

if err := probeQueueHealthPath(timeout, int(queueServingPort), transport); err != nil {
fmt.Fprintln(os.Stderr, err)
return 1
Expand Down
6 changes: 3 additions & 3 deletions cmd/queue/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ const (
)

var (
readinessProbeTimeout = flag.Duration("probe-period", -1, "run readiness probe with given timeout")
startupProbeTimeout = flag.Duration("probe-timeout", -1, "run startup probe with given timeout")

// This creates an abstract socket instead of an actual file.
unixSocketPath = "@/knative.dev/serving/queue.sock"
Expand Down Expand Up @@ -107,15 +107,15 @@ func main() {
flag.Parse()

// If this is set, we run as a standalone binary to probe the queue-proxy.
if *readinessProbeTimeout >= 0 {
if *startupProbeTimeout >= 0 {
// Use a unix socket rather than TCP to avoid going via entire TCP stack
// when we're actually in the same container.
transport := http.DefaultTransport.(*http.Transport).Clone()
transport.DialContext = func(_ context.Context, _, _ string) (net.Conn, error) {
return net.Dial("unix", unixSocketPath)
}

os.Exit(standaloneProbeMain(*readinessProbeTimeout, transport))
os.Exit(standaloneProbeMain(*startupProbeTimeout, transport))
}

// Otherwise, we run as the queue-proxy service.
Expand Down
8 changes: 5 additions & 3 deletions pkg/reconciler/revision/resources/deploy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,13 @@ var (
StartupProbe: &corev1.Probe{
Handler: corev1.Handler{
Exec: &corev1.ExecAction{
Command: []string{"/ko-app/queue", "-probe-period", "0"},
Command: []string{"/ko-app/queue", "-probe-timeout", "5678s"},
},
},
PeriodSeconds: 10,
TimeoutSeconds: 10,
PeriodSeconds: 1,
FailureThreshold: 1,
SuccessThreshold: 1,
TimeoutSeconds: 5678,
},
ReadinessProbe: &corev1.Probe{
Handler: corev1.Handler{
Expand Down
71 changes: 26 additions & 45 deletions pkg/reconciler/revision/resources/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"fmt"
"math"
"strconv"
"time"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand Down Expand Up @@ -171,56 +170,38 @@ func fractionFromPercentage(m map[string]string, k string) (float64, bool) {
return value / 100, err == nil
}

func makeExecProbe(in *corev1.Probe) *corev1.Probe {
if in == nil || in.PeriodSeconds == 0 {
out := &corev1.Probe{
Handler: corev1.Handler{
Exec: &corev1.ExecAction{
Command: []string{"/ko-app/queue", "-probe-period", "0"},
},
},
// The exec probe enables us to retry failed probes quickly to get sub-second
// resolution and achieve faster cold-starts. However, for draining pods as
// part of the K8s lifecycle this period will bound the tail of how quickly
// we can remove a Pod's endpoint from the K8s service.
//
// The trade-off here is that exec probes cost CPU to run, and for idle pods
// (e.g. due to minScale) we see ~50m/{period} of idle CPU usage in the
// queue-proxy. So while setting this to 1s results in slightly faster drains
// it also means that in the steady state the queue-proxy is consuming 10x
// more CPU due to probes than with a period of 10s.
//
// See also: https://github.com/knative/serving/issues/8147
PeriodSeconds: 10,
// We keep the connection open for a while because we're
// actively probing the user-container on that endpoint and
// thus don't want to be limited by K8s granularity here.
TimeoutSeconds: 10,
}

if in != nil {
out.InitialDelaySeconds = in.InitialDelaySeconds
}
return out
func makeStartupExecProbe(in *corev1.Probe, progressDeadlineSeconds int) *corev1.Probe {
if in != nil && in.PeriodSeconds > 0 {
// If the user opted-out of the aggressive probing optimisation we don't
// need to run a startup probe at all.
return nil
}

timeout := time.Second
if in.TimeoutSeconds > 1 {
timeout = time.Duration(in.TimeoutSeconds) * time.Second
}

return &corev1.Probe{
out := &corev1.Probe{
Handler: corev1.Handler{
Exec: &corev1.ExecAction{
Command: []string{"/ko-app/queue", "-probe-period", timeout.String()},
Command: []string{"/ko-app/queue", "-probe-timeout", fmt.Sprintf("%ds", progressDeadlineSeconds)},
},
},
PeriodSeconds: in.PeriodSeconds,
TimeoutSeconds: int32(timeout.Seconds()),
SuccessThreshold: in.SuccessThreshold,
FailureThreshold: in.FailureThreshold,
InitialDelaySeconds: in.InitialDelaySeconds,
// We keep the connection open for a while because we're actively probing
// the user-container on that endpoint and thus don't want to be limited
// by K8s granularity here.
// The exec probe is run as a startup probe so the container will be killed
// and restarted if it fails. We use the ProgressDeadline as the timeout
// here to match the time we'll wait before killing the revision if it
// fails to go ready on initial deployment.
TimeoutSeconds: int32(progressDeadlineSeconds),
// The probe itself retries aggressively so there's no point retrying here too.
FailureThreshold: 1,
SuccessThreshold: 1,
PeriodSeconds: 1,
}

if in != nil {
out.InitialDelaySeconds = in.InitialDelaySeconds
}

return out
}

// makeQueueContainer creates the container spec for the queue sidecar.
Expand Down Expand Up @@ -263,7 +244,7 @@ func makeQueueContainer(rev *v1.Revision, cfg *config.Config) (*corev1.Container
// variable for this probe to use.
userProbe := container.ReadinessProbe.DeepCopy()
applyReadinessProbeDefaultsForExec(userProbe, userPort)
execProbe := makeExecProbe(userProbe)
execProbe := makeStartupExecProbe(userProbe, int(cfg.Deployment.ProgressDeadline.Seconds()))
userProbeJSON, err := readiness.EncodeProbe(userProbe)
if err != nil {
return nil, fmt.Errorf("failed to serialize readiness probe: %w", err)
Expand Down
Loading

0 comments on commit 264d2e9

Please # to comment.