From 2e36ac33e2094eed7052ab8dc1561954a18ca3e3 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Fri, 22 Sep 2023 13:18:22 +0200 Subject: [PATCH] metrics: report metric errors when caching pids When caching a process, report if its PID and TID mismatch so we are aware of any bug that may affect BPF or userspace caching logic. Signed-off-by: Djalal Harouni --- pkg/grpc/exec/exec.go | 1 + pkg/metrics/errormetrics/errormetrics.go | 2 ++ pkg/process/process.go | 3 +++ 3 files changed, 6 insertions(+) diff --git a/pkg/grpc/exec/exec.go b/pkg/grpc/exec/exec.go index de605e0a56a..433f0efe74c 100644 --- a/pkg/grpc/exec/exec.go +++ b/pkg/grpc/exec/exec.go @@ -314,6 +314,7 @@ func GetProcessExit(event *MsgExitEventUnix) *tetragon.ProcessExit { "event.process.tid": event.Info.Tid, "event.process.binary": tetragonProcess.Binary, }).Warn("ExitEvent: process PID and TID mismatch") + errormetrics.ErrorTotalInc(errormetrics.ProcessPidTidMismatch) } tetragonEvent := &tetragon.ProcessExit{ diff --git a/pkg/metrics/errormetrics/errormetrics.go b/pkg/metrics/errormetrics/errormetrics.go index 4f117a04752..207e591f522 100644 --- a/pkg/metrics/errormetrics/errormetrics.go +++ b/pkg/metrics/errormetrics/errormetrics.go @@ -21,6 +21,8 @@ var ( ProcessCacheEvicted ErrorType = "process_cache_evicted" // Process not found on remove() call. ProcessCacheMissOnRemove ErrorType = "process_cache_miss_on_remove" + // Tid and Pid mismatch that could affect BPF and user space caching logic + ProcessPidTidMismatch ErrorType = "process_pid_tid_mismatch" // Event cache podInfo retries failed. EventCachePodInfoRetryFailed ErrorType = "event_cache_podinfo_retry_failed" // Event cache failed to set process information for an event. 
diff --git a/pkg/process/process.go b/pkg/process/process.go index 4f84279b91e..42878860d99 100644 --- a/pkg/process/process.go +++ b/pkg/process/process.go @@ -10,6 +10,7 @@ import ( "sync" "sync/atomic" + "github.com/cilium/tetragon/pkg/metrics/errormetrics" hubble "github.com/cilium/tetragon/pkg/oldhubble/cilium" "github.com/sirupsen/logrus" @@ -220,6 +221,7 @@ func initProcessInternalExec( }).Warn("ExecveEvent: process PID and TID mismatch") // Explicitly reset TID to be PID process.TID = process.PID + errormetrics.ErrorTotalInc(errormetrics.ProcessPidTidMismatch) } return &ProcessInternal{ process: &tetragon.Process{ @@ -273,6 +275,7 @@ func initProcessInternalClone(event *tetragonAPI.MsgCloneEvent, "event.process.exec_id": pi.process.ExecId, "event.parent.exec_id": parentExecId, }).Debug("CloneEvent: process PID and TID mismatch") + errormetrics.ErrorTotalInc(errormetrics.ProcessPidTidMismatch) } // Set the TID here and if we have an exit without an exec we report // directly this TID without copying again objects.