Skip to content

Commit

Permalink
metrics: Wait for a minute before deleting metrics for deleted pods
Browse files Browse the repository at this point in the history
Instead of deleting metrics immediately after a pod is deleted, use a workqueue
to delay the deletion by a minute. This allows the scraper to collect the last
values of the metrics. It's particularly useful when the cluster has short-lived
pods: with immediate deletion, the scraper could miss their metrics completely.

Signed-off-by: Anna Kapuscinska <anna@isovalent.com>
  • Loading branch information
lambdanis committed Aug 17, 2023
1 parent 5eba7b8 commit 4a39c4a
Showing 1 changed file with 27 additions and 1 deletion.
28 changes: 27 additions & 1 deletion pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package metrics
import (
"net/http"
"sync"
"time"

"github.com/cilium/tetragon/pkg/grpc/tracing"
"github.com/cilium/tetragon/pkg/logger"
Expand All @@ -26,10 +27,13 @@ import (
"github.com/prometheus/client_golang/prometheus/promhttp"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
)

// Package-level state shared by the metrics helpers in this file.
var (
// metricsWithPod is the global list of metrics that carry "pod" and
// "namespace" labels; it is returned by ListMetricsWithPod.
metricsWithPod []*prometheus.MetricVec
// podQueue holds deleted pods whose metrics are waiting to be removed.
// It is created lazily by GetPodQueue.
podQueue workqueue.DelayingInterface
// deleteDelay is how long a deleted pod waits in podQueue before its
// metrics are removed, giving the scraper time to read the last values.
deleteDelay = 1 * time.Minute
// once is a one-time initialization guard.
// NOTE(review): presumably it protects the lazy setup of metricsWithPod —
// its use is not visible in this hunk, confirm.
once sync.Once
)

Expand Down Expand Up @@ -59,14 +63,22 @@ func RegisterPodDeleteHandler() {
default:
return
}
DeleteMetricsForPod(pod)
queue := GetPodQueue()
queue.AddAfter(pod, deleteDelay)
},
},
)
},
})
}

// podQueueOnce guards the one-time creation of podQueue so that concurrent
// callers of GetPodQueue always observe the same queue instance.
var podQueueOnce sync.Once

// GetPodQueue returns the package-wide delaying work queue that holds deleted
// pods until their metrics are removed, creating it on first use.
//
// It is called both from the pod delete handler and from the worker goroutine
// started in EnableMetrics, so the lazy initialization must be safe for
// concurrent use. An unsynchronized nil check could race and create two
// queues, leaving pods enqueued on one that the worker never drains.
func GetPodQueue() workqueue.DelayingInterface {
	podQueueOnce.Do(func() {
		// Keep a queue that was already assigned directly to podQueue.
		if podQueue == nil {
			podQueue = workqueue.NewDelayingQueue()
		}
	})
	return podQueue
}

// ListMetricsWithPod returns the global list of all metrics that have "pod"
// and "namespace" labels, initializing it if needed.
func ListMetricsWithPod() []*prometheus.MetricVec {
Expand Down Expand Up @@ -105,6 +117,20 @@ func InitAllMetrics(registry *prometheus.Registry) {
func EnableMetrics(address string) {
reg := prometheus.NewRegistry()
InitAllMetrics(reg)

// Start handling metrics deletion on pod delete events
go func() {
queue := GetPodQueue()
for {
pod, quit := queue.Get()
if quit {
return
}
DeleteMetricsForPod(pod.(*corev1.Pod))
}
}()

// Start the metrics server
logger.GetLogger().WithField("addr", address).Info("Starting metrics server")
http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg}))
http.ListenAndServe(address, nil)
Expand Down

0 comments on commit 4a39c4a

Please sign in to comment.