From 98e9838c3f4efba8d90d2dde152fca1cd1d7e4ee Mon Sep 17 00:00:00 2001 From: Mohse Morad Date: Thu, 14 Nov 2024 17:44:29 +0200 Subject: [PATCH] Handle discovery failure stuck --- src/robusta/core/discovery/discovery.py | 3 +- tests/discovery/test_discovery.py | 44 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 tests/discovery/test_discovery.py diff --git a/src/robusta/core/discovery/discovery.py b/src/robusta/core/discovery/discovery.py index 254aea1dd..efbf8de30 100644 --- a/src/robusta/core/discovery/discovery.py +++ b/src/robusta/core/discovery/discovery.py @@ -169,7 +169,6 @@ def __create_service_info( ) @staticmethod - def create_service_info_from_hikaru(obj: Union[Deployment, DaemonSet, StatefulSet, Pod, ReplicaSet]) -> ServiceInfo: return Discovery.__create_service_info_from_hikaru( obj.metadata, @@ -187,7 +186,7 @@ def create_service_info_from_hikaru(obj: Union[Deployment, DaemonSet, StatefulSe def discovery_process() -> DiscoveryResults: create_monkey_patches() Discovery.stacktrace_thread_active = True - threading.Thread(target=Discovery.stack_dump_on_signal).start() + threading.Thread(target=Discovery.stack_dump_on_signal, daemon=True).start() pods_metadata: List[V1ObjectMeta] = [] node_requests = defaultdict(list) # map between node name, to request of pods running on it active_services: List[ServiceInfo] = [] diff --git a/tests/discovery/test_discovery.py b/tests/discovery/test_discovery.py new file mode 100644 index 000000000..b50317eb5 --- /dev/null +++ b/tests/discovery/test_discovery.py @@ -0,0 +1,44 @@ +import signal +from concurrent.futures import ProcessPoolExecutor +from contextlib import contextmanager +from http import HTTPStatus +from typing import Any, Generator, NoReturn +from unittest.mock import patch + +import kubernetes +import pytest +from kubernetes.client.exceptions import ApiException + +from robusta.core.discovery.discovery import Discovery + + +# pytest-timeout requires pytest>=7, https://github.com/pytest-dev/pytest-timeout/blob/main/setup.cfg +@contextmanager +def time_limit(seconds: int) -> Generator[None, Any, None]: + def signal_handler(_signum: Any, _frame: Any) -> NoReturn: + pytest.fail("Test took to much time...") + + signal.signal(signal.SIGALRM, signal_handler) + signal.alarm(seconds) + try: + yield + finally: + signal.alarm(0) + + +def _patch_worker() -> None: + def _patched(self: Any, **_: Any) -> NoReturn: + raise ApiException(HTTPStatus.INTERNAL_SERVER_ERROR, reason="Internal Server Error") + + kubernetes.client.CoreV1Api.list_node = _patched + + +def test_discovery_recovery_on_failure(): + with time_limit(20): + patched_pool = ProcessPoolExecutor(1, initializer=_patch_worker) + with patch.object(Discovery, "executor", new=patched_pool): + with pytest.raises(ApiException): + Discovery.discover_resources() + + assert patched_pool._shutdown_thread + assert not Discovery.executor._shutdown_thread