Handle discovery failure stuck #1630

Merged · 2 commits · Nov 17, 2024
src/robusta/core/discovery/discovery.py (3 changes: 1 addition & 2 deletions)
@@ -169,7 +169,6 @@ def __create_service_info(
         )
 
     @staticmethod
-
     def create_service_info_from_hikaru(obj: Union[Deployment, DaemonSet, StatefulSet, Pod, ReplicaSet]) -> ServiceInfo:
         return Discovery.__create_service_info_from_hikaru(
             obj.metadata,
@@ -187,7 +186,7 @@ def create_service_info_from_hikaru(obj: Union[Deployment, DaemonSet, StatefulSe
     def discovery_process() -> DiscoveryResults:
         create_monkey_patches()
         Discovery.stacktrace_thread_active = True
-        threading.Thread(target=Discovery.stack_dump_on_signal).start()
+        threading.Thread(target=Discovery.stack_dump_on_signal, daemon=True).start()
         pods_metadata: List[V1ObjectMeta] = []
         node_requests = defaultdict(list)  # map between node name, to request of pods running on it
         active_services: List[ServiceInfo] = []
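The production change is a single line: the stack-dump helper thread is started with daemon=True, so it can no longer keep a discovery worker process alive after the discovery work itself has failed. Below is a minimal, self-contained sketch of that difference; it is not Robusta code, and _block_forever merely stands in for a helper like stack_dump_on_signal that waits indefinitely.

import threading


def _block_forever() -> None:
    # Stands in for a helper thread that blocks forever waiting for a signal.
    threading.Event().wait()


def run_worker(daemon: bool) -> None:
    # Start the helper thread, then fail, as a discovery worker might.
    threading.Thread(target=_block_forever, daemon=daemon).start()
    raise RuntimeError("simulated discovery failure")


if __name__ == "__main__":
    try:
        run_worker(daemon=True)
    except RuntimeError:
        pass
    # With daemon=True the interpreter exits here; with daemon=False it would hang
    # at shutdown, waiting on the non-daemon helper thread forever.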
tests/discovery/test_discovery.py (44 changes: 44 additions & 0 deletions)
@@ -0,0 +1,44 @@
import signal
from concurrent.futures import ProcessPoolExecutor
from contextlib import contextmanager
from http import HTTPStatus
from typing import Any, Generator, NoReturn
from unittest.mock import patch

import kubernetes
import pytest
from kubernetes.client.exceptions import ApiException

from robusta.core.discovery.discovery import Discovery


# pytest-timeout requires pytest>=7, https://github.com/pytest-dev/pytest-timeout/blob/main/setup.cfg
@contextmanager
def time_limit(seconds: int) -> Generator[None, Any, None]:
    def signal_handler(_signum: Any, _frame: Any) -> NoReturn:
        pytest.fail("Test took too much time...")

    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)


def _patch_worker() -> None:
    def _patched(self: Any, **_: Any) -> NoReturn:
        raise ApiException(HTTPStatus.INTERNAL_SERVER_ERROR, reason="Internal Server Error")

    kubernetes.client.CoreV1Api.list_node = _patched


def test_discovery_recovery_on_failure():
    with time_limit(20):
        patched_pool = ProcessPoolExecutor(1, initializer=_patch_worker)
        with patch.object(Discovery, "executor", new=patched_pool):
            with pytest.raises(ApiException):
                Discovery.discover_resources()

        assert patched_pool._shutdown_thread
        assert not Discovery.executor._shutdown_thread
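The test forces the failure path by giving the pool an initializer that monkey-patches CoreV1Api.list_node to raise a 500, and the final assertions imply the recovery behaviour: the broken pool must end up shut down, while Discovery.executor must point at a live replacement. A rough sketch of that pattern, with hypothetical names (the real logic lives in Discovery.discover_resources and is not reproduced here):

from concurrent.futures import ProcessPoolExecutor


def _failing_job() -> None:
    # Placeholder for the real discovery work; here it always fails.
    raise RuntimeError("simulated discovery failure")


class DiscoverySketch:
    executor = ProcessPoolExecutor(max_workers=1)

    @classmethod
    def discover_resources(cls) -> None:
        try:
            cls.executor.submit(_failing_job).result()
        except Exception:
            # Swap in a fresh pool before re-raising so later calls still work;
            # this mirrors what the assertions check via _shutdown_thread on the
            # old and new executors.
            old = cls.executor
            cls.executor = ProcessPoolExecutor(max_workers=1)
            old.shutdown(wait=False)
            raise

With this pattern, calling DiscoverySketch.discover_resources() repeatedly keeps failing with the simulated error, but never with a "cannot schedule new futures after shutdown" error from a dead pool.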