From 8573855e43d404b2d8043d27265c116233b52f94 Mon Sep 17 00:00:00 2001
From: Anthony Day
Date: Mon, 3 Feb 2025 18:49:44 +1100
Subject: [PATCH] Add readiness probe to sidecar

Add an optional exec readiness probe (readinessProbe.enabled, off by
default) to the cyral-sidecar container. The probe runs check-cpu.sh,
shipped in a new ConfigMap and mounted at /scripts, which reports the
container as not ready while the measured CPU usage is above 80% of the
CPU request (or limit) found in the environment.

The metrics.k8s.io rule is dropped from the Role.
---
 templates/cpu-check-configmap.yaml | 97 ++++++++++++++++++++++++++++++
 templates/deployment.yaml          | 17 +++++
 templates/rbac/role.yaml           |  8 --
 values.yaml                        | 15 ++++
 4 files changed, 129 insertions(+), 8 deletions(-)
 create mode 100644 templates/cpu-check-configmap.yaml

diff --git a/templates/cpu-check-configmap.yaml b/templates/cpu-check-configmap.yaml
new file mode 100644
index 0000000..a3b0a67
--- /dev/null
+++ b/templates/cpu-check-configmap.yaml
@@ -0,0 +1,97 @@
+{{- /*
+Copyright Cyral, Inc.
+SPDX-License-Identifier: APACHE-2.0
+*/}}
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "common.names.fullname" . }}-cpu-check
+  namespace: {{ include "common.names.namespace" . | quote }}
+  labels: {{- include "common.labels.standard" ( dict "customLabels" .Values.commonLabels "context" $ ) | nindent 4 }}
+data:
+  check-cpu.sh: |
+    #!/bin/sh
+    #
+    # Readiness check: samples /proc/stat twice, one second apart, and exits 1
+    # (not ready) when the busy CPU time in that interval, expressed in
+    # millicores, is above 80% of the configured CPU request (or limit).
+    # Exits 0 (ready) otherwise, and also when no usable request/limit is set
+    # or no CPU time elapsed, so the check itself never blocks readiness.
+    #
+    # NOTE(review): CPU_REQUEST / CPU_LIMIT are compared against millicores, so
+    # they are presumably integers in millicores -- confirm against the
+    # container env definition, which is not part of this script.
+    # NOTE(review): /proc/stat and nproc normally describe the whole node, not
+    # just this container -- confirm that node-wide usage is the intended signal.
+
+    # Succeed when $1 is a positive decimal integer without a leading zero
+    # (shell arithmetic would read a leading zero as octal).
+    is_positive_int() {
+      case "$1" in
+        '' | 0* | *[!0-9]*) return 1 ;;
+      esac
+      return 0
+    }
+
+    # Store the cumulative total and idle (idle + iowait) CPU times from the
+    # aggregate "cpu" line of /proc/stat in total_cpu_time and idle_cpu_time.
+    sample_cpu_times() {
+      # Word splitting is intended: $1 is the "cpu" label and $2..$9 are the
+      # user, nice, system, idle, iowait, irq, softirq and steal counters.
+      # shellcheck disable=SC2046
+      set -- $(grep '^cpu ' /proc/stat)
+      total_cpu_time=$(($2 + $3 + $4 + $5 + $6 + $7 + $8 + $9))
+      idle_cpu_time=$(($5 + $6))
+    }
+
+    # Get number of CPU cores
+    NUM_CORES=$(nproc)
+
+    # Get CPU request or limit from environment, treat 0 as unset. Values that
+    # are not positive integers (empty, 0, "500m", "0.5") are rejected up front:
+    # they would otherwise abort the arithmetic below and fail the probe.
+    if ! is_positive_int "$CPU_REQUEST"; then
+      CPU_REQUEST=$CPU_LIMIT
+    fi
+    if ! is_positive_int "$CPU_REQUEST"; then
+      echo "Neither CPU_REQUEST nor CPU_LIMIT environment variables are set to a valid value"
+      exit 0
+    fi
+
+    # Take two samples one second apart to get a delta
+    sample_cpu_times
+    total_before=$total_cpu_time
+    idle_before=$idle_cpu_time
+    sleep 1
+    sample_cpu_times
+
+    total_delta=$((total_cpu_time - total_before))
+    idle_delta=$((idle_cpu_time - idle_before))
+
+    # Without elapsed CPU time the division below would fail; nothing was
+    # measured, so there is no evidence of overload and the check passes.
+    if [ "$total_delta" -le 0 ]; then
+      echo "No CPU time elapsed between samples, skipping CPU check"
+      exit 0
+    fi
+
+    # Calculate total available millicores across all cores
+    TOTAL_MILLICORES=$((NUM_CORES * 1000))
+
+    # Convert the busy share of the interval directly to millicores. Rounding
+    # to a whole percentage first would lose up to 1% of TOTAL_MILLICORES,
+    # which on a large node can exceed the whole CPU request.
+    CPU_USAGE_MILLI=$(( (total_delta - idle_delta) * TOTAL_MILLICORES / total_delta ))
+
+    # Calculate 80% of the CPU request as threshold
+    CPU_THRESHOLD=$(( CPU_REQUEST * 80 / 100 ))
+
+    # Compare CPU usage with threshold
+    if [ "$CPU_USAGE_MILLI" -gt "$CPU_THRESHOLD" ]; then
+      echo "CPU usage (${CPU_USAGE_MILLI}m) is above 80% of request (${CPU_THRESHOLD}m) across ${NUM_CORES} cores"
+      exit 1
+    fi
+
+    echo "CPU usage (${CPU_USAGE_MILLI}m) is below threshold (${CPU_THRESHOLD}m) across ${NUM_CORES} cores"
+    exit 0
diff --git a/templates/deployment.yaml b/templates/deployment.yaml
index eee3e7d..1ac99f4 100644
--- a/templates/deployment.yaml
+++ b/templates/deployment.yaml
@@ -65,6 +65,10 @@ spec:
           emptyDir: {}
         - name: cyral-sidecar-ca-bundles
           emptyDir: {}
+        - name: cpu-check-script
+          configMap:
+            name: {{ include "common.names.fullname" . }}-cpu-check
+            defaultMode: 0755
       initContainers:
         - name: init-sidecar
           image: {{ include "cyral.image" . }}
@@ -90,6 +94,17 @@ spec:
         - name: cyral-sidecar
           image: {{ include "cyral.image" . }}
           imagePullPolicy: {{ .Values.image.pullPolicy | quote }}
+          {{- if .Values.readinessProbe.enabled }}
+          readinessProbe:
+            exec:
+              command:
+                - /scripts/check-cpu.sh
+            initialDelaySeconds: {{ .Values.readinessProbe.initialDelaySeconds }}
+            timeoutSeconds: {{ .Values.readinessProbe.timeoutSeconds }}
+            periodSeconds: {{ .Values.readinessProbe.periodSeconds }}
+            failureThreshold: {{ .Values.readinessProbe.failureThreshold }}
+            successThreshold: {{ .Values.readinessProbe.successThreshold }}
+          {{- end }}
           {{- if .Values.containerSecurityContext.enabled }}
           securityContext: {{- omit .Values.containerSecurityContext "enabled" | toYaml | nindent 12 }}
           {{- end }}
@@ -199,6 +214,8 @@ spec:
               mountPath: /etc/nginx
             - name: cyral-sidecar-ca-bundles
               mountPath: /etc/cyral/cyral-certificate-manager/bundles
+            - name: cpu-check-script
+              mountPath: /scripts
           {{- if .Values.extraVolumeMounts }}
           {{- include "common.tplvalues.render" ( dict "value" .Values.extraVolumeMounts "context" $) | nindent 12 }}
           {{- end }}
diff --git a/templates/rbac/role.yaml b/templates/rbac/role.yaml
index 8b64be8..988ab92 100644
--- a/templates/rbac/role.yaml
+++ b/templates/rbac/role.yaml
@@ -27,14 +27,6 @@ rules:
       - "services"
     verbs:
       - "get"
-  - apiGroups:
-      - "metrics.k8s.io"
-    resources:
-      - "pods"
-    verbs:
-      - "get"
-      - "list"
-      - "watch"
   {{- if .Values.rbac.rules }}
   {{- include "common.tplvalues.render" ( dict "value" .Values.rbac.rules "context" $ ) | nindent 2 }}
   {{- end }}
diff --git a/values.yaml b/values.yaml
index 1a57577..7db066b 100644
--- a/values.yaml
+++ b/values.yaml
@@ -207,6 +207,21 @@ autoscaling:
   maxReplicas: ""
   targetCPU: ""
   targetMemory: ""
+## ref: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#container-probes
+## @param readinessProbe.enabled Enable readinessProbe
+## @param readinessProbe.initialDelaySeconds Initial delay seconds for readinessProbe
+## @param readinessProbe.periodSeconds Period seconds for readinessProbe
+## @param readinessProbe.timeoutSeconds Timeout seconds for readinessProbe
+## @param readinessProbe.failureThreshold Failure threshold for readinessProbe
+## @param readinessProbe.successThreshold Success threshold for readinessProbe
+##
+readinessProbe:
+  enabled: false
+  initialDelaySeconds: 5
+  timeoutSeconds: 3
+  periodSeconds: 5
+  failureThreshold: 3
+  successThreshold: 1
 ## Example:
 ## resources:
 ##   requests: