From 27b9f6e3b11fadb90167d61fbe7a6a8ed1dae7e5 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Sun, 8 Sep 2024 15:52:17 +0100 Subject: [PATCH 1/2] Move ratelimitmetrics inside pkg/exporter Tetragon exposes a counter of events rate limited on export. Let's move this metric inside the exporter package, so that it's clear what it measures. Considering possible future development: * If there are multiple exporters, rate limits from all of them will be included in the same counter (no change) * If there are rate limiters not belonging to any exporter, their drops won't be counted (this changes here - before all drops by all rate limiters would be included in the metric) Mixing together drops from different exporters/rate limiters might be misleading, but not exposing some drops at all is problematic too. If this becomes an issue, a better solution would be probably exposing a counter per rate limiter (e.g. reuse existing RateLimiter.dropped field), labeled with a rate limiter identifier. But for now it seems a premature optimization. Signed-off-by: Anna Kapuscinska --- pkg/exporter/exporter.go | 1 + pkg/exporter/metrics.go | 16 ++++++++++--- .../ratelimitmetrics/ratelimitmetrics.go | 23 ------------------- pkg/metricsconfig/healthmetrics.go | 3 --- pkg/ratelimit/ratelimit.go | 2 -- 5 files changed, 14 insertions(+), 31 deletions(-) delete mode 100644 pkg/metrics/ratelimitmetrics/ratelimitmetrics.go diff --git a/pkg/exporter/exporter.go b/pkg/exporter/exporter.go index bc8bf072902..2d21a0e709b 100644 --- a/pkg/exporter/exporter.go +++ b/pkg/exporter/exporter.go @@ -56,6 +56,7 @@ func (e *Exporter) Start() error { func (e *Exporter) Send(event *tetragon.GetEventsResponse) error { if e.rateLimiter != nil && !e.rateLimiter.Allow() { e.rateLimiter.Drop() + rateLimitDropped.Inc() return nil } diff --git a/pkg/exporter/metrics.go b/pkg/exporter/metrics.go index a95db1f0392..5a1a32cd532 100644 --- a/pkg/exporter/metrics.go +++ b/pkg/exporter/metrics.go @@ -29,12 +29,22 @@ var ( Name: "events_last_exported_timestamp", Help: "Timestamp of the most recent event to be exported", }) + + rateLimitDropped = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: consts.MetricsNamespace, + Name: "ratelimit_dropped_total", + Help: "The total number of rate limit Tetragon drops", + ConstLabels: nil, + }) ) func RegisterMetrics(group metrics.Group) { - group.MustRegister(eventsExportedTotal) - group.MustRegister(eventsExportedBytesTotal) - group.MustRegister(eventsExportTimestamp) + group.MustRegister( + eventsExportedTotal, + eventsExportedBytesTotal, + eventsExportTimestamp, + rateLimitDropped, + ) } func newExportedBytesCounterWriter(w io.Writer, c prometheus.Counter) io.Writer { diff --git a/pkg/metrics/ratelimitmetrics/ratelimitmetrics.go b/pkg/metrics/ratelimitmetrics/ratelimitmetrics.go deleted file mode 100644 index 86daba4952c..00000000000 --- a/pkg/metrics/ratelimitmetrics/ratelimitmetrics.go +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Copyright Authors of Tetragon - -package ratelimitmetrics - -import ( - "github.com/cilium/tetragon/pkg/metrics" - "github.com/cilium/tetragon/pkg/metrics/consts" - "github.com/prometheus/client_golang/prometheus" -) - -var ( - RateLimitDropped = prometheus.NewCounter(prometheus.CounterOpts{ - Namespace: consts.MetricsNamespace, - Name: "ratelimit_dropped_total", - Help: "The total number of rate limit Tetragon drops", - ConstLabels: nil, - }) -) - -func RegisterMetrics(group metrics.Group) { - group.MustRegister(RateLimitDropped) -} diff --git a/pkg/metricsconfig/healthmetrics.go b/pkg/metricsconfig/healthmetrics.go index 612149ce34a..53bdf3300d6 100644 --- a/pkg/metricsconfig/healthmetrics.go +++ b/pkg/metricsconfig/healthmetrics.go @@ -17,7 +17,6 @@ import ( "github.com/cilium/tetragon/pkg/metrics/opcodemetrics" "github.com/cilium/tetragon/pkg/metrics/policyfiltermetrics" "github.com/cilium/tetragon/pkg/metrics/policystatemetrics" - "github.com/cilium/tetragon/pkg/metrics/ratelimitmetrics" "github.com/cilium/tetragon/pkg/metrics/watchermetrics" "github.com/cilium/tetragon/pkg/observer" "github.com/cilium/tetragon/pkg/process" @@ -81,8 +80,6 @@ func registerHealthMetrics(group metrics.Group) { // tracing metrics tracing.RegisterMetrics(group) group.ExtendInit(tracing.InitMetrics) - // rate limit metrics - ratelimitmetrics.RegisterMetrics(group) // exporter metrics exporter.RegisterMetrics(group) // cgrup rate metrics diff --git a/pkg/ratelimit/ratelimit.go b/pkg/ratelimit/ratelimit.go index 4043952258b..52c5512a4e2 100644 --- a/pkg/ratelimit/ratelimit.go +++ b/pkg/ratelimit/ratelimit.go @@ -11,7 +11,6 @@ import ( "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/pkg/encoder" "github.com/cilium/tetragon/pkg/logger" - "github.com/cilium/tetragon/pkg/metrics/ratelimitmetrics" "github.com/cilium/tetragon/pkg/reader/node" "golang.org/x/time/rate" "google.golang.org/protobuf/types/known/timestamppb" @@ -78,5 +77,4 @@ func (r *RateLimiter) reportRateLimitInfo(encoder encoder.EventEncoder) { func (r *RateLimiter) Drop() { atomic.AddUint64(&r.dropped, 1) - ratelimitmetrics.RateLimitDropped.Inc() } From c64c5939ea9dd528c2fbe95793fa8f4dbff54021 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Sun, 8 Sep 2024 16:23:29 +0100 Subject: [PATCH 2/2] exporter: Rename ratelimit drops counter There are multiple places where events can be rate limited, and potentially there could be multiple things being rate limited. Let's make it clear from the metric name and description what and where is being rate limitted. Signed-off-by: Anna Kapuscinska --- contrib/upgrade-notes/latest.md | 2 +- docs/content/en/docs/reference/metrics.md | 8 ++++---- pkg/exporter/metrics.go | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/contrib/upgrade-notes/latest.md b/contrib/upgrade-notes/latest.md index 6781dd46772..bebca62b221 100644 --- a/contrib/upgrade-notes/latest.md +++ b/contrib/upgrade-notes/latest.md @@ -23,4 +23,4 @@ Depending on your setup, changes listed here might require a manual intervention ### Metrics -* TBD +* `tetragon_ratelimit_dropped_total` metric is renamed to `tetragon_export_ratelimit_events_dropped_total` diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md index 381ace5eeef..b3b750fc593 100644 --- a/docs/content/en/docs/reference/metrics.md +++ b/docs/content/en/docs/reference/metrics.md @@ -103,6 +103,10 @@ Timestamp of the most recent event to be exported Number of events missing process info. +### `tetragon_export_ratelimit_events_dropped_total` + +Number of events dropped on export due to rate limiting + ### `tetragon_flags_total` The total number of Tetragon flags. For internal use only. @@ -263,10 +267,6 @@ Process Loader event statistics. For internal use only. | ----- | ------ | | `count` | `LoaderReceived, LoaderResolvedImm, LoaderResolvedRetry` | -### `tetragon_ratelimit_dropped_total` - -The total number of rate limit Tetragon drops - ### `tetragon_tracingpolicy_loaded` The number of loaded tracing policy by state. diff --git a/pkg/exporter/metrics.go b/pkg/exporter/metrics.go index 5a1a32cd532..40e804f7a15 100644 --- a/pkg/exporter/metrics.go +++ b/pkg/exporter/metrics.go @@ -32,8 +32,8 @@ var ( rateLimitDropped = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: consts.MetricsNamespace, - Name: "ratelimit_dropped_total", - Help: "The total number of rate limit Tetragon drops", + Name: "export_ratelimit_events_dropped_total", + Help: "Number of events dropped on export due to rate limiting", ConstLabels: nil, }) )