[system-health] Remove monit dependency from system health #91

Closed · wants to merge 1 commit
1 change: 1 addition & 0 deletions rules/system-health.mk
@@ -4,6 +4,7 @@ SYSTEM_HEALTH = system_health-1.0-py3-none-any.whl
$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health
$(SYSTEM_HEALTH)_PYTHON_VERSION = 3
$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY3) $(SONIC_CONFIG_ENGINE_PY3)
$(SYSTEM_HEALTH)_DEBS_DEPENDS = $(LIBSWSSCOMMON) $(PYTHON3_SWSSCOMMON)
SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH)

export system_health_py3_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))"
17 changes: 9 additions & 8 deletions src/system-health/health_checker/manager.py
@@ -1,3 +1,11 @@
from . import utils
from .config import Config
from .health_checker import HealthChecker
from .service_checker import ServiceChecker
from .hardware_checker import HardwareChecker
from .user_defined_checker import UserDefinedChecker


class HealthCheckerManager(object):
"""
Manage all system health checkers and system health configuration.
@@ -10,7 +18,6 @@ def __init__(self):
self._checkers = []
self._state = self.STATE_BOOTING

-        from .config import Config
self.config = Config()
self.initialize()

@@ -19,8 +26,6 @@ def initialize(self):
Initialize the manager. Create service checker and hardware checker by default.
:return:
"""
-        from .service_checker import ServiceChecker
-        from .hardware_checker import HardwareChecker
self._checkers.append(ServiceChecker())
self._checkers.append(HardwareChecker())

@@ -31,7 +36,6 @@ def check(self, chassis):
        :return: A tuple. The first element indicates the status of the checker; the second element is a dictionary that
        contains the status of all objects that were checked.
"""
-        from .health_checker import HealthChecker
HealthChecker.summary = HealthChecker.STATUS_OK
stats = {}
self.config.load_config()
@@ -45,7 +49,6 @@ def check(self, chassis):
self._do_check(checker, stats)

if self.config.user_defined_checkers:
-            from .user_defined_checker import UserDefinedChecker
for udc in self.config.user_defined_checkers:
checker = UserDefinedChecker(udc)
self._do_check(checker, stats)
@@ -71,7 +74,6 @@ def _do_check(self, checker, stats):
else:
stats[category].update(info)
except Exception as e:
-            from .health_checker import HealthChecker
error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
entry = {str(checker): {
HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
@@ -83,8 +85,7 @@ def _do_check(self, checker, stats):
stats['Internal'].update(entry)

def _is_system_booting(self):
-        from .utils import get_uptime
-        uptime = get_uptime()
+        uptime = utils.get_uptime()
if not self.boot_timeout:
self.boot_timeout = self.config.get_bootup_timeout()
booting = uptime < self.boot_timeout
274 changes: 266 additions & 8 deletions src/system-health/health_checker/service_checker.py
@@ -1,12 +1,32 @@
import docker
import os
import pickle
import re


import swsssdk
from sonic_py_common import multi_asic
from sonic_py_common.logger import Logger
from .health_checker import HealthChecker
from . import utils

SYSLOG_IDENTIFIER = 'service_checker'
logger = Logger(log_identifier=SYSLOG_IDENTIFIER)


class ServiceChecker(HealthChecker):
"""
    Checker that checks critical system service and process status.
"""

# Cache file to save critical_process_dict
CRITICAL_PROCESS_CACHE = '/tmp/critical_process_cache'

CRITICAL_PROCESSES_PATH = 'etc/supervisor/critical_processes'

# Command to get merged directory of a container
GET_CONTAINER_FOLDER_CMD = 'docker inspect {} --format "{{{{.GraphDriver.Data.MergedDir}}}}"'

# Command to query the status of monit service.
CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'

@@ -21,26 +41,174 @@ class ServiceChecker(HealthChecker):
'Filesystem': 'Accessible',
'Program': 'Status ok'
}

def __init__(self):
HealthChecker.__init__(self)
self.critical_process_dict = {}
        # Containers that have an invalid critical_processes file
self.bad_containers = set()

self.container_feature_dict = {}

self.need_save_cache = False

self.load_critical_process_cache()

    def get_expected_running_container_set(self, feature_table):
        """Get a set of containers that are expected to be running on SONiC

Args:
feature_table (object): FEATURE table in CONFIG_DB

        Returns:
            tuple: A set of expected container names, and a dict mapping container name to feature name
"""
containers = set()
container_feature_dict = {}
for feature_name in feature_table.keys():
if feature_table[feature_name]["state"] not in ["disabled", "always_disabled"]:
if multi_asic.is_multi_asic():
if feature_table[feature_name]["has_global_scope"] == "True":
containers.add(feature_name)
container_feature_dict[feature_name] = feature_name
if feature_table[feature_name]["has_per_asic_scope"] == "True":
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
containers.add(feature_name + str(asic_id))
container_feature_dict[feature_name + str(asic_id)] = feature_name
else:
containers.add(feature_name)
container_feature_dict[feature_name] = feature_name

return containers, container_feature_dict
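
For context, a minimal sketch of the scope logic above, assuming a hypothetical two-ASIC platform (the multi_asic calls are replaced by constants, and the FEATURE entries are illustrative):

```python
# Hypothetical FEATURE table; names, states and scopes are illustrative only.
feature_table = {
    'database':  {'state': 'enabled',  'has_global_scope': 'True',  'has_per_asic_scope': 'True'},
    'swss':      {'state': 'enabled',  'has_global_scope': 'False', 'has_per_asic_scope': 'True'},
    'telemetry': {'state': 'disabled', 'has_global_scope': 'True',  'has_per_asic_scope': 'False'},
}
NUM_ASICS = 2  # stands in for multi_asic.get_num_asics()

containers = set()
for name, attrs in feature_table.items():
    if attrs['state'] in ('disabled', 'always_disabled'):
        continue  # disabled features are never expected to run
    if attrs['has_global_scope'] == 'True':
        containers.add(name)                # one host-wide instance
    if attrs['has_per_asic_scope'] == 'True':
        containers.update(name + str(i) for i in range(NUM_ASICS))  # per-ASIC instances

print(sorted(containers))
# ['database', 'database0', 'database1', 'swss0', 'swss1']
```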

def get_current_running_container_set(self):
"""Get current running containers, if the running container is not in self.critical_process_dict,
try get the critical process list

Returns:
set: A set of running containers
"""
DOCKER_CLIENT = docker.DockerClient(base_url='unix://var/run/docker.sock')
running_containers = set()
ctrs = DOCKER_CLIENT.containers
try:
lst = ctrs.list(filters={"status": "running"})

for ctr in lst:
running_containers.add(ctr.name)
if ctr.name not in self.critical_process_dict:
self.get_critical_process_by_container(ctr.name)
except docker.errors.APIError as err:
logger.log_debug("Failed to retrieve the running container list. Error: '{}'".format(err))
return running_containers
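
A standalone sketch of the docker SDK call used here; it assumes the docker Python package and read access to /var/run/docker.sock:

```python
import docker

# Connect to the local daemon the same way ServiceChecker does.
client = docker.DockerClient(base_url='unix://var/run/docker.sock')
try:
    running = {ctr.name for ctr in client.containers.list(filters={'status': 'running'})}
    print(running)  # e.g. {'database', 'swss', 'syncd'} -- names depend on the device
except docker.errors.APIError as err:
    print('Failed to list containers: {}'.format(err))
```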

def get_critical_process_list_from_file(self, container, critical_processes_file):
"""
@summary: Read the critical processes from CRITICAL_PROCESSES_FILE.
        @return: A list which contains critical processes.
"""
critical_process_list = []

with open(critical_processes_file, 'r') as file:
for line in file:
# ignore blank lines
if re.match(r"^\s*$", line):
continue
line_info = line.strip(' \n').split(':')
if len(line_info) != 2:
# Invalid syntax in critical_processes file, save it as an error
self.bad_containers.add(container)
logger.log_error('Invalid syntax in critical_processes file of {}'.format(container))
continue

identifier_key = line_info[0].strip()
identifier_value = line_info[1].strip()
if identifier_key == "program" and identifier_value:
# We only count lines like "program:<name>"
critical_process_list.append(identifier_value)

return critical_process_list
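
For reference, a critical_processes file looks roughly like the hypothetical sample below; only well-formed program:<name> lines are collected, group lines are ignored, and malformed lines mark the container bad:

```python
# Hypothetical critical_processes content; real files live inside each container
# at etc/supervisor/critical_processes.
sample = '''\
program:orchagent
program:portsyncd

group:bgp
badline
'''

critical = []
for line in sample.splitlines():
    if not line.strip():
        continue                                  # blank lines are skipped
    parts = line.strip(' \n').split(':')
    if len(parts) != 2:
        print('invalid syntax: {}'.format(line))  # container would be marked bad
        continue
    key, value = parts[0].strip(), parts[1].strip()
    if key == 'program' and value:
        critical.append(value)                    # only program:<name> lines count

print(critical)  # ['orchagent', 'portsyncd']
```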

def get_critical_process_by_container(self, container):
"""Get critical process for a given container

Args:
container (str): container name
"""
        # Get container merged directory
container_folder = self._get_container_folder(container)
if not container_folder:
            logger.log_debug('Failed to get container folder for {}'.format(container))
return

# If container folder does not exist, the container is probably not up, retry it
if not os.path.exists(container_folder):
logger.log_debug('Container folder does not exist: {}'.format(container_folder))
return

# Get critical_processes file path
critical_processes_file = os.path.join(container_folder, ServiceChecker.CRITICAL_PROCESSES_PATH)
if not os.path.isfile(critical_processes_file):
# Critical process file does not exist, the container has no critical processes.
# This is fine, don't retry.
logger.log_debug('Failed to get critical process file for {}, {} does not exist'.format(container, critical_processes_file))
self._update_critical_process_dict(container, [])
return

# Get critical process list from critical_processes
critical_process_list = self.get_critical_process_list_from_file(container, critical_processes_file)
self._update_critical_process_dict(container, critical_process_list)

def _update_critical_process_dict(self, container, critical_process_list):
self.critical_process_dict[container] = critical_process_list
self.need_save_cache = True

def _get_container_folder(self, container):
return utils.run_command(ServiceChecker.GET_CONTAINER_FOLDER_CMD.format(container))
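
One note on GET_CONTAINER_FOLDER_CMD: the quadruple braces are str.format escapes that collapse into the double braces docker's Go template syntax expects. A quick sketch with a hypothetical container name:

```python
GET_CONTAINER_FOLDER_CMD = 'docker inspect {} --format "{{{{.GraphDriver.Data.MergedDir}}}}"'

# '{{{{' and '}}}}' become literal '{{' and '}}' after str.format.
print(GET_CONTAINER_FOLDER_CMD.format('snmp'))  # 'snmp' is a hypothetical name
# docker inspect snmp --format "{{.GraphDriver.Data.MergedDir}}"
# Typical output on an overlay2 host (illustrative):
# /var/lib/docker/overlay2/<layer-id>/merged
```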

def save_critical_process_cache(self):
"""Save self.critical_process_dict to a cache file
"""
if not self.need_save_cache:
return

        self.need_save_cache = False
if not self.critical_process_dict:
# if critical_process_dict is empty, don't save it
return

if os.path.exists(ServiceChecker.CRITICAL_PROCESS_CACHE):
# if cache file exists, remove it
os.remove(ServiceChecker.CRITICAL_PROCESS_CACHE)

with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'wb+') as f:
pickle.dump(self.critical_process_dict, f)

def load_critical_process_cache(self):
if not os.path.isfile(ServiceChecker.CRITICAL_PROCESS_CACHE):
# cache file does not exist
return

with open(ServiceChecker.CRITICAL_PROCESS_CACHE, 'rb') as f:
self.critical_process_dict = pickle.load(f)
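
A round-trip sketch of the pickle cache, using a temporary path instead of /tmp/critical_process_cache:

```python
import os
import pickle
import tempfile

# Demo path; the checker itself uses /tmp/critical_process_cache.
cache_path = os.path.join(tempfile.gettempdir(), 'critical_process_cache_demo')

# Save: same shape as self.critical_process_dict (container -> process list).
with open(cache_path, 'wb+') as f:
    pickle.dump({'swss': ['orchagent', 'portsyncd']}, f)

# Load it back, as load_critical_process_cache does at startup.
with open(cache_path, 'rb') as f:
    print(pickle.load(f))  # {'swss': ['orchagent', 'portsyncd']}

os.remove(cache_path)
```

Persisting this dict lets the checker keep flagging missing critical processes right after a restart, before it has re-read each container's critical_processes file.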

def reset(self):
self._info = {}

def get_category(self):
return 'Services'

-    def check(self, config):
+    def check_by_monit(self, config):
        """
-        Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system,
-        process and file system.
+        Get and analyze the output of $CHECK_CMD, collect status for file system or customized checkers if any.
        :param config: Health checker configuration.
        :return:
        """
        self.reset()
-        output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD).strip()
-        if output != 'active':
+        output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD)
+        if not output or output != 'active':
            self.set_object_not_ok('Service', 'monit', 'monit service is not running')
            return

@@ -58,7 +226,7 @@ def check(self, config):

for line in lines[2:]:
name = line[0:status_begin].strip()
-            if config.ignore_services and name in config.ignore_services:
+            if config and config.ignore_services and name in config.ignore_services:
continue
status = line[status_begin:type_begin].strip()
service_type = line[type_begin:].strip()
@@ -70,3 +238,93 @@ def check(self, config):
else:
self.set_object_ok(service_type, name)
return
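
The column parsing above slices each row at the offsets of the Status and Type headers. A hedged sketch of that idea on made-up $CHECK_CMD output (the real monit report layout may differ):

```python
# Made-up $CHECK_CMD output; column positions come from the header line.
sample = '''Monit 5.20.0 uptime: 1h 2m
Service Name                     Status           Type
sonic                            Running          System
root-overlay                     Accessible       Filesystem'''

lines = sample.splitlines()
status_begin = lines[1].find('Status')  # offset of the Status column
type_begin = lines[1].find('Type')      # offset of the Type column

for line in lines[2:]:
    name = line[0:status_begin].strip()
    status = line[status_begin:type_begin].strip()
    service_type = line[type_begin:].strip()
    print((name, status, service_type))
# ('sonic', 'Running', 'System')
# ('root-overlay', 'Accessible', 'Filesystem')
```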

def check_services(self, config):
"""Check status of critical services and critical processes

Args:
config (config.Config): Health checker configuration.
"""
config_db = swsssdk.ConfigDBConnector()
config_db.connect()
feature_table = config_db.get_table("FEATURE")
        expected_running_containers, self.container_feature_dict = self.get_expected_running_container_set(feature_table)
current_running_containers = self.get_current_running_container_set()
self.save_critical_process_cache()

not_running_containers = expected_running_containers.difference(current_running_containers)
for container in not_running_containers:
self.set_object_not_ok('Service', container, "Container '{}' is not running".format(container))

if not self.critical_process_dict:
            # Critical process dict is empty, which is not expected
self.set_object_not_ok('Service', 'system', 'no critical process found')
return

for container, critical_process_list in self.critical_process_dict.items():
self.check_process_existence(container, critical_process_list, config, feature_table)

for bad_container in self.bad_containers:
self.set_object_not_ok('Service', bad_container, 'Syntax of critical_processes file is incorrect')

def check(self, config):
"""
Check critical system service status.
:param config: Health checker configuration.
:return:
"""
self.reset()
self.check_by_monit(config)
self.check_services(config)


def _parse_supervisorctl_status(self, process_status):
"""Expected input:
arp_update RUNNING pid 67, uptime 1:03:56
buffermgrd RUNNING pid 81, uptime 1:03:56

Args:
            process_status (list): Lines of supervisorctl status output

        Returns:
            dict: A map from process name to its supervisor state (e.g. RUNNING)
        """
data = {}
for line in process_status:
line = line.strip()
if not line:
continue
items = line.split()
data[items[0].strip()] = items[1].strip()
return data
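
Running the parser on the sample from its docstring yields a name-to-state map:

```python
sample = [
    'arp_update                       RUNNING   pid 67, uptime 1:03:56',
    'buffermgrd                       RUNNING   pid 81, uptime 1:03:56',
    '',
]

data = {}
for line in sample:
    line = line.strip()
    if not line:
        continue               # blank lines are skipped, as in the method
    items = line.split()
    data[items[0]] = items[1]  # process name -> supervisor state

print(data)  # {'arp_update': 'RUNNING', 'buffermgrd': 'RUNNING'}
```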

def check_process_existence(self, container_name, critical_process_list, config, feature_table):
"""
@summary: Check whether the process in the specified container is running or not.
"""
feature_name = self.container_feature_dict[container_name]
if feature_name in feature_table:
# We look into the 'FEATURE' table to verify whether the container is disabled or not.
        # If the container is disabled, we exit.
if ("state" in feature_table[feature_name]
and feature_table[feature_name]["state"] not in ["disabled", "always_disabled"]):

                # We are using supervisorctl status to check the critical process status. We cannot leverage psutil here because
                # it is not always possible to get a process's cmdline from supervisor.conf. E.g., the cmdline of orchagent is "/usr/bin/orchagent",
                # however, in supervisor.conf it is "/usr/bin/orchagent.sh".
cmd = 'docker exec {} bash -c "supervisorctl status"'.format(container_name)
process_status = utils.run_command(cmd)
if process_status is None:
for process_name in critical_process_list:
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
return

process_status = self._parse_supervisorctl_status(process_status.strip().splitlines())
for process_name in critical_process_list:
if config and config.ignore_services and process_name in config.ignore_services:
continue

                    # Sometimes process_name is in the critical_processes file but not in supervisor.conf; such a process will not run
                    # in the container, and it is safe to ignore it, e.g. radv. So here we only check processes which appear in process_status.
if process_name in process_status:
if process_status[process_name] != 'RUNNING':
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
else:
self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))
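
The supervisorctl-over-psutil comment above is the key design note: critical_processes entries match supervisor program names, not kernel cmdlines. A tiny illustration (values taken from that comment):

```python
# From the comment above: how supervisor names a process vs its real cmdline.
critical_process = 'orchagent'                  # entry in critical_processes
supervisor_status = {'orchagent': 'RUNNING'}    # parsed `supervisorctl status`
psutil_cmdline = '/usr/bin/orchagent'           # not '/usr/bin/orchagent.sh'

# Name-based lookup against supervisor works even though the wrapper script
# and the running binary have different paths; a cmdline match would not.
print(supervisor_status.get(critical_process))  # RUNNING
```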
2 changes: 1 addition & 1 deletion src/system-health/health_checker/utils.py
@@ -9,7 +9,7 @@ def run_command(command):
"""
try:
process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE)
-        return process.communicate()[0]
+        return process.communicate()[0].strip()
except Exception:
return None
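
With the .strip() folded into run_command, callers such as check_by_monit can compare stdout to 'active' directly. A quick usage sketch, assuming the health_checker package is importable:

```python
from health_checker import utils  # hypothetical import path for this package

print(utils.run_command('echo active'))  # 'active' -- trailing newline gone
print(utils.run_command('true'))         # '' -- empty stdout stays a string
# None appears only when Popen itself raises, which is why the caller also
# guards with "if not output or output != 'active'".
```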
