diff --git a/kube_controller_manager/changelog.d/15914.added b/kube_controller_manager/changelog.d/15914.added new file mode 100644 index 0000000000000..b775fda3c4d80 --- /dev/null +++ b/kube_controller_manager/changelog.d/15914.added @@ -0,0 +1 @@ +Add support for kube_controller_manager SLI metrics diff --git a/kube_controller_manager/datadog_checks/kube_controller_manager/kube_controller_manager.py b/kube_controller_manager/datadog_checks/kube_controller_manager/kube_controller_manager.py index 629990b5ef3cb..b8780f79a8398 100644 --- a/kube_controller_manager/datadog_checks/kube_controller_manager/kube_controller_manager.py +++ b/kube_controller_manager/datadog_checks/kube_controller_manager/kube_controller_manager.py @@ -13,13 +13,15 @@ from datadog_checks.base.config import is_affirmative from datadog_checks.base.utils.http import RequestsWrapper +from .sli_metrics import SliMetricsScraperMixin + NEW_1_24_COUNTERS = { # This metric replaces the deprecated node_collector_evictions_number metric as of k8s v1.24+ 'node_collector_evictions_total': 'nodes.evictions', } -class KubeControllerManagerCheck(KubeLeaderElectionMixin, OpenMetricsBaseCheck): +class KubeControllerManagerCheck(KubeLeaderElectionMixin, SliMetricsScraperMixin, OpenMetricsBaseCheck): DEFAULT_METRIC_LIMIT = 0 DEFAULT_IGNORE_DEPRECATED = False @@ -149,6 +151,13 @@ def __init__(self, name, init_config, instances): instance['health_url'] = url + slis_instance = self.create_sli_prometheus_instance(instance) + instance['sli_scraper_config'] = self.get_scraper_config(slis_instance) + if instance.get('slis_available') is None: + instance['slis_available'] = self.detect_sli_endpoint( + self.get_http_handler(instance['sli_scraper_config']), slis_instance.get('prometheus_url') + ) + def check(self, instance): # Get the configuration for this specific instance scraper_config = self.get_scraper_config(instance) @@ -182,6 +191,10 @@ def check(self, instance): self._perform_service_check(instance) + if 
instance.get('sli_scraper_config') and instance.get('slis_available'): + self.log.debug('Processing kube controller manager SLI metrics') + self.process(instance['sli_scraper_config'], metric_transformers=self.sli_transformers) + def _ignore_deprecated_metric(self, metric, scraper_config): return metric.documentation.startswith("(Deprecated)")
diff --git a/kube_controller_manager/datadog_checks/kube_controller_manager/sli_metrics.py b/kube_controller_manager/datadog_checks/kube_controller_manager/sli_metrics.py
new file mode 100644
index 0000000000000..dbcfa461f1c64
--- /dev/null
+++ b/kube_controller_manager/datadog_checks/kube_controller_manager/sli_metrics.py
@@ -0,0 +1,110 @@
+# (C) Datadog, Inc. 2023-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+from __future__ import division
+
+from copy import deepcopy
+
+from datadog_checks.base.checks.openmetrics import OpenMetricsBaseCheck
+
+SLI_METRICS_PATH = '/slis'
+
+SLI_METRICS_MAP = {
+    'kubernetes_healthcheck': 'kubernetes_healthcheck',
+    'kubernetes_healthchecks_total': 'kubernetes_healthchecks_total',
+}
+
+
+class SliMetricsScraperMixin(OpenMetricsBaseCheck):
+    """
+    This class scrapes metrics for the kube controller manager "/metrics/slis" prometheus endpoint and submits them on
+    behalf of a check.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(SliMetricsScraperMixin, self).__init__(*args, **kwargs)
+        # Every metric exposed by the SLI endpoint goes through the same transformer.
+        self.sli_transformers = {
+            'kubernetes_healthcheck': self.sli_metrics_transformer,
+            'kubernetes_healthchecks_total': self.sli_metrics_transformer,
+        }
+
+    def create_sli_prometheus_instance(self, instance):
+        """
+        Create a copy of the instance and set default values.
+        This is so the base class can create a scraper_config with the proper values.
+
+        :param instance: check instance; its `prometheus_url` is the base the SLI path is appended to.
+        :return: a deep copy of `instance` with the SLI namespace and the SLI endpoint as `prometheus_url`.
+        """
+        KUBE_CONTROLLER_MANAGER_SLI_NAMESPACE = "kube_controller_manager.slis"
+
+        sli_instance = deepcopy(instance)
+        sli_instance.update(
+            {
+                'namespace': KUBE_CONTROLLER_MANAGER_SLI_NAMESPACE,
+                'prometheus_url': instance.get('prometheus_url') + SLI_METRICS_PATH,
+            }
+        )
+        return sli_instance
+
+    def detect_sli_endpoint(self, http_handler, url):
+        """
+        Whether the SLI metrics endpoint is available (k8s 1.26+).
+
+        :param http_handler: HTTP client whose `get(url, stream=True)` returns a requests-style response.
+        :param url: URL of the SLI metrics endpoint to probe.
+        :return: true if the endpoint returns 200, false otherwise (including when the request itself fails).
+        """
+        try:
+            r = http_handler.get(url, stream=True)
+        except Exception as e:
+            self.log.debug("Error querying SLIs endpoint: %s", e)
+            return False
+
+        status_code = r.status_code
+        # Only the status is needed. The body of a streamed response is never read here, so close the response
+        # explicitly instead of leaving its connection open.
+        r.close()
+
+        if status_code == 403:
+            # Adjacent literals (not a backslash inside the string) so no indentation leaks into the message.
+            self.log.debug(
+                "The /metrics/slis endpoint was introduced in Kubernetes v1.26. If you expect to see SLI metrics, "
+                "please check that your permissions are configured properly."
+            )
+        return status_code == 200
+
+    def sli_metrics_transformer(self, metric, scraper_config):
+        """
+        Submit only the `healthz` samples of an SLI metric, with the `name` label renamed to `sli_name` and the
+        `type` label dropped.
+
+        :param metric: metric family scraped from the SLI endpoint; it is left unmodified.
+        :param scraper_config: scraper configuration passed through to `submit_openmetric`.
+        """
+        # Work on the copy's samples so the relabelling below never touches the caller's metric.
+        modified_metric = deepcopy(metric)
+        healthz_samples = []
+
+        for sample in modified_metric.samples:
+            # `.get` so a sample without a `type` label is skipped rather than raising KeyError.
+            metric_type = sample[self.SAMPLE_LABELS].get("type")
+            if metric_type == "healthz":
+                self._rename_sli_tag(sample, "sli_name", "name")
+                self._remove_tag(sample, "type")
+                healthz_samples.append(sample)
+            else:
+                self.log.debug("Skipping metric with type `%s`", metric_type)
+
+        modified_metric.samples = healthz_samples
+        self.submit_openmetric(SLI_METRICS_MAP[modified_metric.name], modified_metric, scraper_config)
+
+    def _rename_sli_tag(self, sample, new_tag_name, old_tag_name):
+        """Move the value of label `old_tag_name` to `new_tag_name`, in place. Raises KeyError if it is absent."""
+        sample[self.SAMPLE_LABELS][new_tag_name] = sample[self.SAMPLE_LABELS][old_tag_name]
+        del sample[self.SAMPLE_LABELS][old_tag_name]
+
+    def _remove_tag(self, sample, tag_name):
+        """Delete label `tag_name` from the sample, in place. Raises KeyError if it is absent."""
+        del sample[self.SAMPLE_LABELS][tag_name]
diff --git a/kube_controller_manager/hatch.toml b/kube_controller_manager/hatch.toml index 2591d1fd29c42..ef1fd80da9e98 100644 --- a/kube_controller_manager/hatch.toml +++ b/kube_controller_manager/hatch.toml @@ -4,6 +4,11 @@ base-package-features = [ "kube", ] +[envs.default] +dependencies
= [ + "requests-mock==1.4.0", +] + [[envs.default.matrix]] python = ["2.7", "3.9"] diff --git a/kube_controller_manager/metadata.csv b/kube_controller_manager/metadata.csv index 4d5a40aaef16e..9cacfcd10ede4 100644 --- a/kube_controller_manager/metadata.csv +++ b/kube_controller_manager/metadata.csv @@ -25,3 +25,5 @@ kube_controller_manager.queue.work_unfinished_duration,gauge,,second,,"How many kube_controller_manager.queue.queue_duration.count,gauge,,,,"How long item stays in a queue before being requested, by queue",0,kubernetes_controller_manager,queue.duration.count, kube_controller_manager.queue.queue_duration.sum,gauge,,second,,"Total time of items stays in a queue before being requested, by queue",-1,kubernetes_controller_manager,queue.duration.sum, kube_controller_manager.job_controller.terminated_pods_tracking_finalizer,count,,,,"Used to monitor whether the job controller is removing Pod finalizers from terminated Pods after accounting them in Job status",0,kubernetes_controller_manager,job_controller.terminated_pods_tracking_finalizer, +kube_controller_manager.slis.kubernetes_healthcheck,gauge,,,,"Result of a single controller manager healthcheck (alpha; requires k8s v1.26+)",0,kube_controller_manager,slis.kubernetes_healthcheck, +kube_controller_manager.slis.kubernetes_healthchecks_total,count,,,,"Cumulative results of all controller manager healthchecks (alpha; requires k8s v1.26+)",0,kube_controller_manager,slis.kubernetes_healthchecks_total, diff --git a/kube_controller_manager/tests/common.py b/kube_controller_manager/tests/common.py new file mode 100644 index 0000000000000..75f466741b742 --- /dev/null +++ b/kube_controller_manager/tests/common.py @@ -0,0 +1,7 @@ +# (C) Datadog, Inc.
2023-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) + +from datadog_checks.dev import get_here + +HERE = get_here() diff --git a/kube_controller_manager/tests/conftest.py b/kube_controller_manager/tests/conftest.py index ad86ea63653b4..5f7425aec5248 100644 --- a/kube_controller_manager/tests/conftest.py +++ b/kube_controller_manager/tests/conftest.py @@ -6,8 +6,7 @@ import pytest INSTANCE = { - 'prometheus_url': 'http://localhost:10055/metrics', - 'tags': ['custom:tag'], + 'prometheus_url': 'http://localhost:10257/metrics', } diff --git a/kube_controller_manager/tests/fixtures/metrics_slis_1.27.3.txt b/kube_controller_manager/tests/fixtures/metrics_slis_1.27.3.txt new file mode 100644 index 0000000000000..948e3191cc36a --- /dev/null +++ b/kube_controller_manager/tests/fixtures/metrics_slis_1.27.3.txt @@ -0,0 +1,26 @@ +# HELP kubernetes_healthcheck [ALPHA] This metric records the result of a single healthcheck. +# TYPE kubernetes_healthcheck gauge +kubernetes_healthcheck{name="attachdetach",type="healthz"} 1 +kubernetes_healthcheck{name="bootstrapsigner",type="healthz"} 1 +kubernetes_healthcheck{name="clusterrole-aggregation",type="healthz"} 1 +kubernetes_healthcheck{name="cronjob",type="healthz"} 1 +kubernetes_healthcheck{name="csrapproving",type="healthz"} 1 +kubernetes_healthcheck{name="csrcleaner",type="healthz"} 1 +kubernetes_healthcheck{name="csrsigning",type="healthz"} 1 +kubernetes_healthcheck{name="daemonset",type="healthz"} 1 +kubernetes_healthcheck{name="deployment",type="healthz"} 1 +kubernetes_healthcheck{name="disruption",type="healthz"} 1 +kubernetes_healthcheck{name="etcd",type="readyz"} 1 +# HELP kubernetes_healthchecks_total [ALPHA] This metric records the results of all healthcheck. 
+# TYPE kubernetes_healthchecks_total counter +kubernetes_healthchecks_total{name="attachdetach",status="success",type="healthz"} 423 +kubernetes_healthchecks_total{name="bootstrapsigner",status="success",type="healthz"} 423 +kubernetes_healthchecks_total{name="clusterrole-aggregation",status="success",type="healthz"} 423 +kubernetes_healthchecks_total{name="cronjob",status="success",type="healthz"} 423 +kubernetes_healthchecks_total{name="csrapproving",status="success",type="healthz"} 423 +kubernetes_healthchecks_total{name="csrcleaner",status="success",type="healthz"} 423 +kubernetes_healthchecks_total{name="csrsigning",status="success",type="healthz"} 423 +kubernetes_healthchecks_total{name="daemonset",status="success",type="healthz"} 423 +kubernetes_healthchecks_total{name="deployment",status="success",type="healthz"} 423 +kubernetes_healthchecks_total{name="disruption",status="success",type="healthz"} 423 +kubernetes_healthchecks_total{name="etcd",status="success",type="readyz"} 15
diff --git a/kube_controller_manager/tests/test_sli_metrics.py b/kube_controller_manager/tests/test_sli_metrics.py
new file mode 100644
index 0000000000000..c3ff350c550ef
--- /dev/null
+++ b/kube_controller_manager/tests/test_sli_metrics.py
@@ -0,0 +1,147 @@
+# (C) Datadog, Inc. 2023-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+
+import os
+
+import mock
+import pytest
+import requests_mock
+
+from datadog_checks.kube_controller_manager import KubeControllerManagerCheck
+
+from .common import HERE
+
+# Constants
+CHECK_NAME = 'kube_controller_manager'
+SLI_URL = 'http://localhost:10257/metrics/slis'
+
+# `name` label of every `healthz` sample in fixtures/metrics_slis_1.27.3.txt
+HEALTHZ_SLI_NAMES = [
+    'attachdetach',
+    'bootstrapsigner',
+    'clusterrole-aggregation',
+    'cronjob',
+    'csrapproving',
+    'csrcleaner',
+    'csrsigning',
+    'daemonset',
+    'deployment',
+    'disruption',
+]
+
+
+@pytest.fixture()
+def mock_metrics():
+    """Patch `requests.get` to answer with the SLI fixture file and a 200 status."""
+    f_name = os.path.join(HERE, 'fixtures', 'metrics_slis_1.27.3.txt')
+    with open(f_name, 'r') as f:
+        text_data = f.read()
+    with mock.patch(
+        'requests.get',
+        return_value=mock.MagicMock(
+            status_code=200, iter_lines=lambda **kwargs: text_data.split("\n"), headers={'Content-Type': "text/plain"}
+        ),
+    ):
+        yield
+
+
+@pytest.fixture()
+def mock_request():
+    """Yield an active `requests_mock.Mocker`."""
+    with requests_mock.Mocker() as m:
+        yield m
+
+
+def run_check(instance):
+    """Build the check from `instance` and run it once."""
+    c = KubeControllerManagerCheck(CHECK_NAME, {}, [instance])
+    c.check(instance)
+
+
+def assert_metric(aggregator, name, **kwargs):
+    """Assert on `name` under the check's namespace; keeps assertions < 120 chars."""
+    aggregator.assert_metric("{}.{}".format(CHECK_NAME, name), **kwargs)
+
+
+def test_check_metrics_slis(aggregator, mock_metrics, mock_request, instance):
+    mock_request.get(SLI_URL, status_code=200)
+    run_check(instance)
+
+    for sli_name in HEALTHZ_SLI_NAMES:
+        assert_metric(
+            aggregator,
+            'slis.kubernetes_healthcheck',
+            value=1,
+            metric_type=aggregator.GAUGE,
+            tags=['sli_name:{}'.format(sli_name)],
+        )
+    for sli_name in HEALTHZ_SLI_NAMES:
+        assert_metric(
+            aggregator,
+            'slis.kubernetes_healthchecks_total',
+            metric_type=aggregator.MONOTONIC_COUNT,
+            value=423,
+            tags=['sli_name:{}'.format(sli_name), 'status:success'],
+        )
+    aggregator.assert_all_metrics_covered()
+
+
+def test_check_metrics_slis_transform(aggregator, mock_metrics, mock_request, instance):
+    mock_request.get(SLI_URL, status_code=200)
+    run_check(instance)
+
+    # Check that no metrics with `name` tag come through
+    assert_metric(
+        aggregator, 'slis.kubernetes_healthcheck', count=0, metric_type=aggregator.GAUGE, tags=['name:attachdetach']
+    )
+    assert_metric(
+        aggregator,
+        'slis.kubernetes_healthchecks_total',
+        metric_type=aggregator.MONOTONIC_COUNT,
+        count=0,
+        tags=['name:attachdetach', 'status:success'],
+    )
+
+
+def test_check_metrics_slis_filter_by_type(aggregator, mock_metrics, mock_request, instance):
+    mock_request.get(SLI_URL, status_code=200)
+    run_check(instance)
+
+    # Check that metrics with type other than `healthz` are filtered out
+    assert_metric(
+        aggregator,
+        'slis.kubernetes_healthcheck',
+        count=0,
+        metric_type=aggregator.GAUGE,
+        tags=['sli_name:etcd', 'type:readyz'],
+    )
+    assert_metric(
+        aggregator,
+        'slis.kubernetes_healthchecks_total',
+        metric_type=aggregator.MONOTONIC_COUNT,
+        count=0,
+        tags=['sli_name:etcd', 'status:success', 'type:readyz'],
+    )
+
+
+def test_detect_sli_endpoint(mock_metrics, instance):
+    with mock.patch('requests.get') as mock_get:
+        mock_get.return_value.status_code = 200
+        run_check(instance)
+        assert instance["slis_available"] is True
+
+
+def test_detect_sli_endpoint_404(mock_metrics, instance):
+    with mock.patch('requests.get') as mock_get:
+        mock_get.return_value.status_code = 404
+        run_check(instance)
+        assert instance["slis_available"] is False
+
+
+def test_detect_sli_endpoint_403(mock_metrics, instance):
+    # No `mock_request` fixture here: `requests.get` is patched directly, as in the two tests above.
+    with mock.patch('requests.get') as mock_get:
+        mock_get.return_value.status_code = 403
+        run_check(instance)
+        assert instance["slis_available"] is False