From 4a39c4a6fddc8410edcfc46275395533a4f9df72 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Tue, 8 Aug 2023 22:38:35 +0100 Subject: [PATCH] metrics: Wait for a minute before deleting metrics for deleted pods Instead of deleting metrics immediately after a pod is deleted, use a workqueue to delay the deletion a minute. This allows the scraper to scrape last values of the metrics. It's particularly useful when the cluster has short-lived pods - with immediate deletion the scraper could completely miss metrics for them. Signed-off-by: Anna Kapuscinska --- pkg/metrics/metrics.go | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 73fc9c75f54..6ce6ef72fab 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -6,6 +6,7 @@ package metrics import ( "net/http" "sync" + "time" "github.com/cilium/tetragon/pkg/grpc/tracing" "github.com/cilium/tetragon/pkg/logger" @@ -26,10 +27,13 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" corev1 "k8s.io/api/core/v1" "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" ) var ( metricsWithPod []*prometheus.MetricVec + podQueue workqueue.DelayingInterface + deleteDelay = 1 * time.Minute once sync.Once ) @@ -59,7 +63,8 @@ func RegisterPodDeleteHandler() { default: return } - DeleteMetricsForPod(pod) + queue := GetPodQueue() + queue.AddAfter(pod, deleteDelay) }, }, ) @@ -67,6 +72,13 @@ func RegisterPodDeleteHandler() { }) } +func GetPodQueue() workqueue.DelayingInterface { + if podQueue == nil { + podQueue = workqueue.NewDelayingQueue() + } + return podQueue +} + // ListMetricsWithPod returns the global list of all metrics that have "pod" // and "namespace" labels, initializing it if needed. func ListMetricsWithPod() []*prometheus.MetricVec { @@ -105,6 +117,20 @@ func InitAllMetrics(registry *prometheus.Registry) { func EnableMetrics(address string) { reg := prometheus.NewRegistry() InitAllMetrics(reg) + + // Start handling metrics deletion on pod delete events + go func() { + queue := GetPodQueue() + for { + pod, quit := queue.Get() + if quit { + return + } + DeleteMetricsForPod(pod.(*corev1.Pod)) + } + }() + + // Start the metrics server logger.GetLogger().WithField("addr", address).Info("Starting metrics server") http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg})) http.ListenAndServe(address, nil)