Alertmanager: Initialize skipped Grafana Alertmanagers receiving requests #10691

Open · wants to merge 7 commits into main
13 changes: 12 additions & 1 deletion cmd/mimir/config-descriptor.json
@@ -16024,13 +16024,24 @@
"kind": "field",
"name": "grafana_alertmanager_conditionally_skip_tenant_suffix",
"required": false,
"desc": "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.",
"desc": "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration or they are receiving alerts.",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldFlag": "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix",
"fieldType": "string",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "grafana_alertmanager_idle_grace_period",
"required": false,
"desc": "Duration to wait before shutting down an idle Alertmanager for a tenant that matches grafana-alertmanager-conditionally-skip-tenant-suffix and is using an unpromoted or default configuration.",
"fieldValue": null,
"fieldDefaultValue": 300000000000,
"fieldFlag": "alertmanager.grafana-alertmanager-grace-period",
"fieldType": "duration",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "max_concurrent_get_requests_per_tenant",
4 changes: 3 additions & 1 deletion cmd/mimir/help-all.txt.tmpl
@@ -240,7 +240,9 @@ Usage of ./cmd/mimir/mimir:
-alertmanager.grafana-alertmanager-compatibility-enabled
[experimental] Enable routes to support the migration and operation of the Grafana Alertmanager.
-alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix string
[experimental] Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.
[experimental] Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration or they are receiving alerts.
-alertmanager.grafana-alertmanager-grace-period duration
[experimental] Duration to wait before shutting down an idle Alertmanager for a tenant that matches grafana-alertmanager-conditionally-skip-tenant-suffix and is using an unpromoted or default configuration. (default 5m0s)
-alertmanager.log-parsing-label-matchers
[experimental] Enable logging when parsing label matchers. This flag is intended to be used with -alertmanager.utf8-strict-mode-enabled to validate UTF-8 strict mode is working as intended.
-alertmanager.max-alerts-count int
@@ -2520,10 +2520,17 @@ sharding_ring:
[grafana_alertmanager_compatibility_enabled: <boolean> | default = false]

# (experimental) Skip starting the Alertmanager for tenants matching this suffix
# unless they have a promoted, non-default Grafana Alertmanager configuration.
# unless they have a promoted, non-default Grafana Alertmanager configuration or
# they are receiving alerts.
# CLI flag: -alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix
[grafana_alertmanager_conditionally_skip_tenant_suffix: <string> | default = ""]

# (experimental) Duration to wait before shutting down an idle Alertmanager for
# a tenant that matches grafana-alertmanager-conditionally-skip-tenant-suffix
# and is using an unpromoted or default configuration.
# CLI flag: -alertmanager.grafana-alertmanager-grace-period
[grafana_alertmanager_idle_grace_period: <duration> | default = 5m]

# (advanced) Maximum number of concurrent GET requests allowed per tenant. The
# zero value (and negative values) result in a limit of GOMAXPROCS or 8,
# whichever is larger. Status code 503 is served for GET requests that would
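For reference, a standalone sketch of how the two new settings fit together: the suffix opts tenants into conditional skipping, and the grace period bounds how long an idle Alertmanager keeps running after its last request. The flag names are taken from this PR; the suffix and duration values are purely illustrative.

```go
package main

import (
	"flag"
	"fmt"
	"time"
)

func main() {
	fs := flag.NewFlagSet("mimir-example", flag.ExitOnError)
	suffix := fs.String("alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "",
		"skip starting the Alertmanager for matching tenants unless they have a usable config or are receiving alerts")
	grace := fs.Duration("alertmanager.grafana-alertmanager-grace-period", 5*time.Minute,
		"how long an idle Alertmanager keeps running after its last request")

	// Example invocation; the "-grafana" suffix and the 10m value are illustrative only.
	if err := fs.Parse([]string{
		"-alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix=-grafana",
		"-alertmanager.grafana-alertmanager-grace-period=10m",
	}); err != nil {
		panic(err)
	}
	fmt.Printf("suffix=%q grace=%s\n", *suffix, *grace)
}
```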
1 change: 0 additions & 1 deletion pkg/alertmanager/alertmanager.go
@@ -104,7 +104,6 @@ type Config struct {
PersisterConfig PersisterConfig

GrafanaAlertmanagerCompatibility bool
GrafanaAlertmanagerTenantSuffix string
Contributor Author commented:
Unrelated fix, this was not being used here.

}

// An Alertmanager manages the alerts for one user.
102 changes: 93 additions & 9 deletions pkg/alertmanager/multitenant.go
@@ -64,6 +64,8 @@
errInvalidExternalURLMissingHostname = errors.New("the configured external URL is invalid because it's missing the hostname")
errZoneAwarenessEnabledWithoutZoneInfo = errors.New("the configured alertmanager has zone awareness enabled but zone is not set")
errNotUploadingFallback = errors.New("not uploading fallback configuration")

zeroTimeUnix = time.Time{}.Unix()
)

// MultitenantAlertmanagerConfig is the configuration for a multitenant Alertmanager.
@@ -83,8 +85,9 @@ type MultitenantAlertmanagerConfig struct {

EnableAPI bool `yaml:"enable_api" category:"advanced"`

GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"`
GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"`
GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"`
GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"`
GrafanaAlertmanagerIdleGracePeriod time.Duration `yaml:"grafana_alertmanager_idle_grace_period" category:"experimental"`

MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"`

@@ -111,7 +114,8 @@ type MultitenantAlertmanagerConfig struct {
}

const (
defaultPeerTimeout = 15 * time.Second
defaultGrafanaAlertmanagerGracePeriod = 5 * time.Minute
defaultPeerTimeout = 15 * time.Second
)

// RegisterFlags adds the features required to config this to the given FlagSet.
@@ -128,7 +132,8 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger

f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.")
f.BoolVar(&cfg.GrafanaAlertmanagerCompatibilityEnabled, "alertmanager.grafana-alertmanager-compatibility-enabled", false, "Enable routes to support the migration and operation of the Grafana Alertmanager.")
f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.")
f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration or they are receiving alerts.")
f.DurationVar(&cfg.GrafanaAlertmanagerIdleGracePeriod, "alertmanager.grafana-alertmanager-grace-period", defaultGrafanaAlertmanagerGracePeriod, "Duration to wait before shutting down an idle Alertmanager for a tenant that matches grafana-alertmanager-conditionally-skip-tenant-suffix and is using an unpromoted or default configuration.")
f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.")

f.BoolVar(&cfg.EnableStateCleanup, "alertmanager.enable-state-cleanup", true, "Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration.")
@@ -189,6 +194,7 @@ type multitenantAlertmanagerMetrics struct {
grafanaStateSize *prometheus.GaugeVec
lastReloadSuccessful *prometheus.GaugeVec
lastReloadSuccessfulTimestamp *prometheus.GaugeVec
tenantsSkipped prometheus.Gauge
}

func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAlertmanagerMetrics {
@@ -212,6 +218,12 @@ func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAl
Help: "Timestamp of the last successful configuration reload.",
}, []string{"user"})

m.tenantsSkipped = promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Namespace: "cortex",
Name: "alertmanager_tenants_skipped",
Help: "Number of per-tenant alertmanagers that were skipped during the last configuration sync.",
})

return m
}
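Note that the new gauge is set once per configuration sync rather than incremented, so scrapes always see the count from the most recent sync. A minimal standalone sketch of that pattern against a local registry (not Mimir code):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()
	tenantsSkipped := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
		Namespace: "cortex",
		Name:      "alertmanager_tenants_skipped",
		Help:      "Number of per-tenant alertmanagers that were skipped during the last configuration sync.",
	})

	// Two simulated syncs: the gauge is overwritten each time, not accumulated.
	for _, skipped := range []int{3, 1} {
		tenantsSkipped.Set(float64(skipped))
	}

	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		for _, m := range mf.GetMetric() {
			fmt.Printf("%s = %v\n", mf.GetName(), m.GetGauge().GetValue()) // cortex_alertmanager_tenants_skipped = 1
		}
	}
}
```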

@@ -318,6 +330,10 @@ type MultitenantAlertmanager struct {
limits Limits
features featurecontrol.Flagger

// Record the last time we received a request for a given Grafana tenant.
// We can shut down an idle Alertmanager after the grace period elapses.
receivingRequests sync.Map

registry prometheus.Registerer
ringCheckErrors prometheus.Counter
tenantsOwned prometheus.Gauge
@@ -390,6 +406,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC
fallbackConfig: string(fallbackConfig),
cfgs: map[string]alertspb.AlertConfigDesc{},
alertmanagers: map[string]*Alertmanager{},
receivingRequests: sync.Map{},
alertmanagerMetrics: newAlertmanagerMetrics(logger),
multitenantMetrics: newMultitenantAlertmanagerMetrics(registerer),
store: store,
@@ -695,6 +712,7 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(1))
am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
}
am.multitenantMetrics.tenantsSkipped.Set(float64(len(amInitSkipped)))

userAlertmanagersToStop := map[string]*Alertmanager{}
am.alertmanagersMtx.Lock()
@@ -727,12 +745,31 @@ func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs)
AlertConfigDesc: cfgs.Mimir,
tmplExternalURL: am.cfg.ExternalURL.URL,
}
strictInit := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix)

// If the Grafana configuration is either default, not promoted, or empty, use the Mimir configuration.
// A Grafana configuration is considered usable if it's promoted, non-default, and not empty.
if !cfgs.Grafana.Promoted || cfgs.Grafana.Default || cfgs.Grafana.RawConfig == "" {
level.Debug(am.logger).Log("msg", "using mimir config", "user", cfgs.Mimir.User)
isGrafanaTenant := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix)
return cfg, !isGrafanaTenant, nil
if !strictInit {
return cfg, true, nil
}

// If the tenant ID matches the configured Grafana suffix, only run the Alertmanager if it's receiving requests.
createdAt, ok := am.receivingRequests.Load(cfgs.Mimir.User)
if !ok || time.Since(time.Unix(createdAt.(int64), 0)) >= am.cfg.GrafanaAlertmanagerIdleGracePeriod {
// Use the zero-value to indicate that we've skipped the tenant.
am.receivingRequests.Store(cfgs.Mimir.User, zeroTimeUnix)
return cfg, false, nil
}

level.Debug(am.logger).Log("msg", "user has no usable config but is receiving alerts, keeping Alertmanager active", "user", cfgs.Mimir.User)
return cfg, true, nil
}

// If the Alertmanager was previously skipped but now has a usable configuration, remove it from the skipped list.
if strictInit {
if _, ok := am.receivingRequests.LoadAndDelete(cfgs.Mimir.User); ok {
level.Debug(am.logger).Log("msg", "user has now a usable config, removing it from skipped list", "user", cfgs.Mimir.User)
}
}

// If the Mimir configuration is either default or empty, use the Grafana configuration.
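The skip decision in this hunk reduces to a small state machine keyed on the last request time per tenant: no recorded request, or an idle period at or beyond the grace period, means skip and store a sentinel; a recent request keeps the Alertmanager running. A simplified, self-contained sketch of that logic, using assumed names rather than the actual Mimir types:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// The zero-value Unix timestamp marks "skipped, no request seen yet".
var zeroTimeUnix = time.Time{}.Unix()

type skipper struct {
	gracePeriod time.Duration
	lastRequest sync.Map // tenant ID -> Unix seconds of the last request
}

// shouldRun reports whether a suffix-matching tenant without a usable Grafana
// config should still get a running Alertmanager.
func (s *skipper) shouldRun(tenant string) bool {
	v, ok := s.lastRequest.Load(tenant)
	if !ok || time.Since(time.Unix(v.(int64), 0)) >= s.gracePeriod {
		// Never saw a request, or idle past the grace period: skip, and record
		// the sentinel so a later request can trigger a lazy start.
		s.lastRequest.Store(tenant, zeroTimeUnix)
		return false
	}
	return true
}

// markRequest records an incoming request for the tenant.
func (s *skipper) markRequest(tenant string) {
	s.lastRequest.Store(tenant, time.Now().Unix())
}

func main() {
	s := &skipper{gracePeriod: 5 * time.Minute}
	fmt.Println(s.shouldRun("123-grafana")) // false: no request has been received yet
	s.markRequest("123-grafana")
	fmt.Println(s.shouldRun("123-grafana")) // true: a request arrived within the grace period
}
```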
@@ -933,7 +970,6 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *defi
Limits: am.limits,
Features: am.features,
GrafanaAlertmanagerCompatibility: am.cfg.GrafanaAlertmanagerCompatibilityEnabled,
GrafanaAlertmanagerTenantSuffix: am.cfg.GrafanaAlertmanagerTenantSuffix,
}, reg)
if err != nil {
return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err)
@@ -1002,6 +1038,27 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http

if ok {
userAM.mux.ServeHTTP(w, req)

// If needed, update the last time the Alertmanager received requests.
if _, ok := am.receivingRequests.Load(userID); ok {
level.Debug(am.logger).Log("msg", "updating last alert reception time", "user", userID)
am.receivingRequests.Store(userID, time.Now().Unix())
}
return
}

// If the Alertmanager initialization was skipped, start the Alertmanager.
if _, ok := am.receivingRequests.Load(userID); ok {
userAM, err = am.startAlertmanager(req.Context(), userID)
if err != nil {
level.Error(am.logger).Log("msg", "unable to initialize the Alertmanager", "user", userID, "err", err)
http.Error(w, "Failed to initialize the Alertmanager", http.StatusInternalServerError)
return
}

am.receivingRequests.Store(userID, time.Now().Unix())
level.Debug(am.logger).Log("msg", "Alertmanager initialized after receiving request", "user", userID)
userAM.mux.ServeHTTP(w, req)
return
}
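The serveRequest change is a lazy, request-driven start guarded by the same map that tracks request times: only tenants previously marked as skipped are started on demand, and every served request refreshes the idle timer. A condensed, runnable sketch of the pattern, using a hypothetical lazyServer type and a trivial per-tenant handler in place of a real Alertmanager:

```go
package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"sync"
	"time"
)

// lazyServer creates a per-tenant handler the first time a request arrives for
// a tenant that was previously marked as skipped.
type lazyServer struct {
	mtx         sync.Mutex
	handlers    map[string]http.Handler
	lastRequest sync.Map // tenant ID -> Unix seconds of the last request
}

func (s *lazyServer) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	tenant := r.Header.Get("X-Scope-OrgID") // assumed tenant header

	s.mtx.Lock()
	h, ok := s.handlers[tenant]
	s.mtx.Unlock()
	if ok {
		h.ServeHTTP(w, r)
		s.lastRequest.Store(tenant, time.Now().Unix()) // keep the idle timer fresh
		return
	}

	// Only tenants previously marked as skipped are started on demand.
	if _, skipped := s.lastRequest.Load(tenant); skipped {
		started := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
			fmt.Fprintf(w, "alertmanager for %s\n", tenant)
		})
		s.mtx.Lock()
		s.handlers[tenant] = started
		s.mtx.Unlock()
		s.lastRequest.Store(tenant, time.Now().Unix())
		started.ServeHTTP(w, r)
		return
	}

	http.Error(w, "the Alertmanager is not configured", http.StatusPreconditionFailed)
}

func main() {
	s := &lazyServer{handlers: map[string]http.Handler{}}
	s.lastRequest.Store("123-grafana", time.Time{}.Unix()) // tenant was skipped during the last sync

	srv := httptest.NewServer(s)
	defer srv.Close()

	req, err := http.NewRequest(http.MethodGet, srv.URL, nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("X-Scope-OrgID", "123-grafana")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status) // 200 OK: the handler was created on demand
}
```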

@@ -1025,6 +1082,33 @@ func (am *MultitenantAlertmanager) serveRequest(w http.ResponseWriter, req *http
http.Error(w, "the Alertmanager is not configured", http.StatusPreconditionFailed)
}

// startAlertmanager will start the Alertmanager for a tenant, using the fallback configuration if no config is found.
func (am *MultitenantAlertmanager) startAlertmanager(ctx context.Context, userID string) (*Alertmanager, error) {
// Avoid starting the Alertmanager for tenants not owned by this instance.
if !am.isUserOwned(userID) {
return nil, errors.Wrap(errNotUploadingFallback, "user not owned by this instance")
}

cfg, err := am.store.GetAlertConfig(ctx, userID)
if err != nil {
if !errors.Is(err, alertspb.ErrNotFound) {
return nil, errors.Wrap(err, "failed to check for existing configuration")
}
cfg = alertspb.ToProto("", nil, userID)
}

amConfig := amConfig{
AlertConfigDesc: cfg,
tmplExternalURL: am.cfg.ExternalURL.URL,
}
if err := am.setConfig(amConfig); err != nil {
return nil, err
}
am.alertmanagersMtx.Lock()
defer am.alertmanagersMtx.Unlock()
return am.alertmanagers[userID], nil
}
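As a usage illustration, any request to the tenant's Alertmanager endpoint should move a skipped tenant back into the running set. The host, port, URL prefix, and tenant ID below are assumptions about a typical deployment, not taken from this PR:

```go
package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Hypothetical endpoint: Mimir typically serves the Alertmanager API under
	// the /alertmanager prefix; adjust host, port, and prefix to your deployment.
	req, err := http.NewRequest(http.MethodGet, "http://localhost:8080/alertmanager/api/v2/status", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("X-Scope-OrgID", "123-grafana") // example tenant ID matching the configured suffix

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status)
	fmt.Println(string(body))
}
```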

func (am *MultitenantAlertmanager) alertmanagerFromFallbackConfig(ctx context.Context, userID string) (*Alertmanager, error) {
// Make sure we never create fallback instances for a user not owned by this instance.
// This check is not strictly necessary as the configuration polling loop will deactivate