Skip to content

Commit

Permalink
Support kubernetes controller manager SLI metrics (#15914)
Browse files Browse the repository at this point in the history
* Add support for SLI metrics in controller

* fixup! Add support for SLI metrics in controller

* Use get requests to probe for /slis endpoint

* Edit typo in documentation for kube controller manager SLI metrics

* Add changelog for kube controller manager

* Format kube controller manager code

* Update tests to mock get request instead of head

* Update sli unit tests

* Change SLI metrics name tag and filter by type for kcm

* Revert changes to kube scheduler

* Remove extra empty line in changelog

* Remove type tag from samples

* Use new changelog format

* Set SLI scraper config per instance

* Remove redundant None value in .get
  • Loading branch information
jennchenn authored Nov 9, 2023
1 parent 3d69b94 commit dc2bdc8
Show file tree
Hide file tree
Showing 9 changed files with 367 additions and 3 deletions.
1 change: 1 addition & 0 deletions kube_controller_manager/changelog.d/15914.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add support for kube_controller_manager SLI metrics
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@
from datadog_checks.base.config import is_affirmative
from datadog_checks.base.utils.http import RequestsWrapper

from .sli_metrics import SliMetricsScraperMixin

# Counter metrics that only exist on Kubernetes v1.24 and later.
# NOTE(review): presumably maps the scraped Prometheus metric name to the Datadog metric
# name suffix; the code consuming this dict is not visible here -- confirm against usage.
NEW_1_24_COUNTERS = {
# This metric replaces the deprecated node_collector_evictions_number metric as of k8s v1.24+
'node_collector_evictions_total': 'nodes.evictions',
}


class KubeControllerManagerCheck(KubeLeaderElectionMixin, OpenMetricsBaseCheck):
class KubeControllerManagerCheck(KubeLeaderElectionMixin, SliMetricsScraperMixin, OpenMetricsBaseCheck):
DEFAULT_METRIC_LIMIT = 0
DEFAULT_IGNORE_DEPRECATED = False

Expand Down Expand Up @@ -149,6 +151,13 @@ def __init__(self, name, init_config, instances):

instance['health_url'] = url

slis_instance = self.create_sli_prometheus_instance(instance)
instance['sli_scraper_config'] = self.get_scraper_config(slis_instance)
if instance.get('slis_available') is None:
instance['slis_available'] = self.detect_sli_endpoint(
self.get_http_handler(instance['sli_scraper_config']), slis_instance.get('prometheus_url')
)

def check(self, instance):
# Get the configuration for this specific instance
scraper_config = self.get_scraper_config(instance)
Expand Down Expand Up @@ -182,6 +191,10 @@ def check(self, instance):

self._perform_service_check(instance)

if instance.get('sli_scraper_config') and instance.get('slis_available'):
self.log.debug('Processing kube controller manager SLI metrics')
self.process(instance['sli_scraper_config'], metric_transformers=self.sli_transformers)

def _ignore_deprecated_metric(self, metric, scraper_config):
return metric.documentation.startswith("(Deprecated)")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# (C) Datadog, Inc. 2023-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
from __future__ import division

from copy import deepcopy

from datadog_checks.base.checks.openmetrics import OpenMetricsBaseCheck

# Path suffix appended to an instance's `prometheus_url` to build the SLI endpoint URL.
SLI_METRICS_PATH = '/slis'

# Scraped SLI metric name -> metric name passed to `submit_openmetric` when the
# transformed metric is submitted.
SLI_METRICS_MAP = {
'kubernetes_healthcheck': 'kubernetes_healthcheck',
'kubernetes_healthchecks_total': 'kubernetes_healthchecks_total',
}


class SliMetricsScraperMixin(OpenMetricsBaseCheck):
    """
    Scrape the kube controller manager "/metrics/slis" Prometheus endpoint and submit its
    metrics on behalf of a check.

    The endpoint URL is derived from the instance's `prometheus_url` by appending
    `SLI_METRICS_PATH`. Only samples whose `type` label is "healthz" are submitted.
    """

    def __init__(self, *args, **kwargs):
        super(SliMetricsScraperMixin, self).__init__(*args, **kwargs)
        # Every metric listed in SLI_METRICS_MAP goes through the same transformer. Deriving
        # the keys from the map keeps both in sync, so the SLI_METRICS_MAP lookup performed in
        # `sli_metrics_transformer` can never miss.
        self.sli_transformers = {name: self.sli_metrics_transformer for name in SLI_METRICS_MAP}

    def create_sli_prometheus_instance(self, instance):
        """
        Create a copy of the instance and set default values.
        This is so the base class can create a scraper_config with the proper values.

        :param instance: the check instance; its `prometheus_url` is used as the base URL.
        :return: a deep copy of `instance` whose `namespace` and `prometheus_url` point at the
                 SLI endpoint. The original instance is left untouched.
        """
        KUBE_CONTROLLER_MANAGER_SLI_NAMESPACE = "kube_controller_manager.slis"

        sli_instance = deepcopy(instance)
        sli_instance.update(
            {
                'namespace': KUBE_CONTROLLER_MANAGER_SLI_NAMESPACE,
                'prometheus_url': instance.get('prometheus_url') + SLI_METRICS_PATH,
            }
        )
        return sli_instance

    def detect_sli_endpoint(self, http_handler, url):
        """
        Whether the SLI metrics endpoint is available (k8s 1.26+).

        :param http_handler: object exposing `get(url, stream=True)` and returning a response
                             with a `status_code` attribute and a `close()` method.
        :param url: full URL of the SLI endpoint to probe.
        :return: true if the endpoint returns 200, false otherwise (including when the
                 request itself raises).
        """
        try:
            response = http_handler.get(url, stream=True)
        except Exception as e:
            self.log.debug("Error querying SLIs endpoint: %s", e)
            return False

        try:
            status_code = response.status_code
        finally:
            # The request is streamed and the body is never read, so the response has to be
            # closed explicitly to release the underlying connection.
            response.close()

        if status_code == 403:
            self.log.debug(
                "The /metrics/slis endpoint was introduced in Kubernetes v1.26. If you expect to see SLI metrics, "
                "please check that your permissions are configured properly."
            )
        return status_code == 200

    def sli_metrics_transformer(self, metric, scraper_config):
        """
        Submit the "healthz" samples of an SLI metric.

        For each kept sample the `name` label is renamed to `sli_name` and the `type` label is
        removed. Label changes are applied in place on the samples of `metric`. Samples with
        any other `type`, or with no `type` label at all, are skipped with a debug log.

        :param metric: the scraped metric; `metric.name` must be a key of SLI_METRICS_MAP.
        :param scraper_config: scraper configuration forwarded to `submit_openmetric`.
        """
        modified_metric = deepcopy(metric)
        modified_metric.samples = []

        for sample in metric.samples:
            # `.get` so that a sample without a `type` label is skipped instead of raising
            # KeyError and aborting the whole scrape.
            metric_type = sample[self.SAMPLE_LABELS].get("type")
            if metric_type != "healthz":
                self.log.debug("Skipping metric with type `%s`", metric_type)
                continue
            self._rename_sli_tag(sample, "sli_name", "name")
            self._remove_tag(sample, "type")
            modified_metric.samples.append(sample)

        self.submit_openmetric(SLI_METRICS_MAP[modified_metric.name], modified_metric, scraper_config)

    def _rename_sli_tag(self, sample, new_tag_name, old_tag_name):
        """Rename label `old_tag_name` to `new_tag_name` in place; no-op if the label is absent."""
        labels = sample[self.SAMPLE_LABELS]
        if old_tag_name in labels:
            labels[new_tag_name] = labels.pop(old_tag_name)

    def _remove_tag(self, sample, tag_name):
        """Drop label `tag_name` from the sample in place; no-op if the label is absent."""
        sample[self.SAMPLE_LABELS].pop(tag_name, None)
5 changes: 5 additions & 0 deletions kube_controller_manager/hatch.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ base-package-features = [
"kube",
]

[envs.default]
dependencies = [
"requests-mock==1.4.0",
]

[[envs.default.matrix]]
python = ["2.7", "3.9"]

Expand Down
2 changes: 2 additions & 0 deletions kube_controller_manager/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,5 @@ kube_controller_manager.queue.work_unfinished_duration,gauge,,second,,"How many
kube_controller_manager.queue.queue_duration.count,gauge,,,,"How long item stays in a queue before being requested, by queue",0,kubernetes_controller_manager,queue.duration.count,
kube_controller_manager.queue.queue_duration.sum,gauge,,second,,"Total time of items stays in a queue before being requested, by queue",-1,kubernetes_controller_manager,queue.duration.sum,
kube_controller_manager.job_controller.terminated_pods_tracking_finalizer,count,,,,"Used to monitor whether the job controller is removing Pod finalizers from terminated Pods after accounting them in Job status",0,kubernetes_controller_manager,job_controller.terminated_pods_tracking_finalizer,
kube_controller_manager.slis.kubernetes_healthcheck,gauge,,,,"Result of a single controller manager healthcheck (alpha; requires k8s v1.26+)",0,kube_controller_manager,slis.kubernetes_healthcheck,
kube_controller_manager.slis.kubernetes_healthchecks_total,count,,,,"Cumulative results of all controller manager healthchecks (alpha; requires k8s v1.26+)",0,kube_controller_manager,slis.kubernetes_healthchecks_total,
7 changes: 7 additions & 0 deletions kube_controller_manager/tests/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# (C) Datadog, Inc. 2023-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

from datadog_checks.dev import get_here

HERE = get_here()
3 changes: 1 addition & 2 deletions kube_controller_manager/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import pytest

INSTANCE = {
'prometheus_url': 'http://localhost:10055/metrics',
'tags': ['custom:tag'],
'prometheus_url': 'http://localhost:10257/metrics',
}


Expand Down
26 changes: 26 additions & 0 deletions kube_controller_manager/tests/fixtures/metrics_slis_1.27.3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# HELP kubernetes_healthcheck [ALPHA] This metric records the result of a single healthcheck.
# TYPE kubernetes_healthcheck gauge
kubernetes_healthcheck{name="attachdetach",type="healthz"} 1
kubernetes_healthcheck{name="bootstrapsigner",type="healthz"} 1
kubernetes_healthcheck{name="clusterrole-aggregation",type="healthz"} 1
kubernetes_healthcheck{name="cronjob",type="healthz"} 1
kubernetes_healthcheck{name="csrapproving",type="healthz"} 1
kubernetes_healthcheck{name="csrcleaner",type="healthz"} 1
kubernetes_healthcheck{name="csrsigning",type="healthz"} 1
kubernetes_healthcheck{name="daemonset",type="healthz"} 1
kubernetes_healthcheck{name="deployment",type="healthz"} 1
kubernetes_healthcheck{name="disruption",type="healthz"} 1
kubernetes_healthcheck{name="etcd",type="readyz"} 1
# HELP kubernetes_healthchecks_total [ALPHA] This metric records the results of all healthcheck.
# TYPE kubernetes_healthchecks_total counter
kubernetes_healthchecks_total{name="attachdetach",status="success",type="healthz"} 423
kubernetes_healthchecks_total{name="bootstrapsigner",status="success",type="healthz"} 423
kubernetes_healthchecks_total{name="clusterrole-aggregation",status="success",type="healthz"} 423
kubernetes_healthchecks_total{name="cronjob",status="success",type="healthz"} 423
kubernetes_healthchecks_total{name="csrapproving",status="success",type="healthz"} 423
kubernetes_healthchecks_total{name="csrcleaner",status="success",type="healthz"} 423
kubernetes_healthchecks_total{name="csrsigning",status="success",type="healthz"} 423
kubernetes_healthchecks_total{name="daemonset",status="success",type="healthz"} 423
kubernetes_healthchecks_total{name="deployment",status="success",type="healthz"} 423
kubernetes_healthchecks_total{name="disruption",status="success",type="healthz"} 423
kubernetes_healthchecks_total{name="etcd",status="success",type="readyz"} 15
Loading

0 comments on commit dc2bdc8

Please # to comment.