Skip to content

Commit

Permalink
implementing heartbeat critical tolerance for unhealthy detection (#29)
Browse files Browse the repository at this point in the history
  • Loading branch information
bigtallcampbell authored Aug 1, 2024
1 parent a5269ae commit 3798beb
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 13 deletions.
11 changes: 11 additions & 0 deletions src/Models/AppConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ public static string _SPACEFX_CONFIG_DIR {

public int HEARTBEAT_PULSE_TIMING_MS { get; set; }
public int HEARTBEAT_RECEIVED_TOLERANCE_MS { get; set; }
public int HEARTBEAT_RECEIVED_CRITICAL_TOLERANCE_MS { get; set; }
public bool RESOURCE_MONITOR_ENABLED { get; set; }
public int RESOURCE_MONITOR_TIMING_MS { get; set; }
public bool RESOURCE_SCAVENGER_ENABLED { get; set; }
Expand All @@ -40,6 +41,16 @@ public APP_CONFIG() {
HEARTBEAT_PULSE_TIMING_MS = 2000;
}

// Load the critical heartbeat tolerance; fall back to 60000 ms when the setting cannot be retrieved or parsed.
try {
    HEARTBEAT_RECEIVED_CRITICAL_TOLERANCE_MS = int.Parse(GetConfigSetting("heartbeatreceivedcriticaltolerancems").Result);
} catch (Exception ex) {
    Console.WriteLine("Error retrieving heartbeatreceivedcriticaltolerancems: " + ex.Message);
    Console.WriteLine("Setting default value of '60000'");
    // The default belongs to the critical tolerance, not the pulse timing:
    // assigning HEARTBEAT_PULSE_TIMING_MS here would leave this property unset
    // and overwrite the pulse interval configured just above.
    HEARTBEAT_RECEIVED_CRITICAL_TOLERANCE_MS = 60000;
}



try {
HEARTBEAT_RECEIVED_TOLERANCE_MS = int.Parse(GetConfigSetting("heartbeatreceivedtolerancems").Result);
} catch (Exception ex) {
Expand Down
14 changes: 6 additions & 8 deletions src/Services/HeartbeatService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ public class HeartbeatService : IHostedService, Core.IMonitorableService {
private readonly ILogger<HeartbeatService> _logger;
private readonly TimeSpan HeartBeatPulseTiming;
private readonly TimeSpan _heartBeatHeardTolerance;
private readonly TimeSpan _heartBeatHeardCriticalTolerance;
private readonly Core.Client _client;
private readonly IServiceProvider _serviceProvider;
private readonly IHostApplicationLifetime _appLifetime;
Expand All @@ -19,9 +20,11 @@ public class HeartbeatService : IHostedService, Core.IMonitorableService {
private readonly Core.APP_CONFIG _appConfig;
private readonly DateTime _appStartTime;
public bool IsHealthy() {
if (_heartbeatsHeard.IsEmpty && DateTime.UtcNow > _appStartTime.Add(_heartBeatHeardTolerance * 2)) {
DateTime heartbeatStaleTime = DateTime.UtcNow.Subtract(_heartBeatHeardTolerance);

if (_heartbeatsHeard.Values.Where(p => p.CurrentSystemTime.ToDateTime().ToUniversalTime() >= heartbeatStaleTime).Count() == 0 && DateTime.UtcNow > _appStartTime.Add(_heartBeatHeardCriticalTolerance)) {
// Log a critical error and return a false value to indicate an unhealthy state.
_logger.LogCritical("No heartbeats have been heard in the last {tolerance}. Returning unhealthy. ", _heartBeatHeardTolerance);
_logger.LogCritical($"No heartbeats have been heard in the last {_heartBeatHeardCriticalTolerance}. Returning unhealthy. ");
return false;
}

Expand All @@ -41,6 +44,7 @@ public HeartbeatService(ILogger<HeartbeatService> logger, IServiceProvider servi

HeartBeatPulseTiming = TimeSpan.FromMilliseconds(_appConfig.HEARTBEAT_PULSE_TIMING_MS);
_heartBeatHeardTolerance = TimeSpan.FromMilliseconds(_appConfig.HEARTBEAT_RECEIVED_TOLERANCE_MS);
_heartBeatHeardCriticalTolerance = TimeSpan.FromMilliseconds(_appConfig.HEARTBEAT_RECEIVED_CRITICAL_TOLERANCE_MS);

_logger.LogInformation("Services.{serviceName} Initialized. HeartBeatPulseTiming: {pulseTiming} HeartBeatHeardTolerance: {pulseHeardTolerance} ", nameof(HeartbeatService), HeartBeatPulseTiming, _heartBeatHeardTolerance);

Expand Down Expand Up @@ -142,12 +146,6 @@ internal void RemoveStaleHeartbeatsFromCache() {
// Log successful removal of stale heartbeats.
_logger.LogTrace("All stale heartbeats successfully removed.");

// Check if the cache is empty and the current time exceeds the app start time by the tolerance period.
if (_heartbeatsHeard.IsEmpty && DateTime.UtcNow > _appStartTime.Add(_heartBeatHeardTolerance * 2)) {
// Log a critical error and throw an exception to potentially trigger a service restart.
_logger.LogCritical("No heartbeats have been heard in the last {tolerance}. Triggering an exception to restart the pod.", _heartBeatHeardTolerance);
throw new ApplicationException($"No heartbeats have been heard in the last {_heartBeatHeardTolerance}. Triggering an exception to restart the pod.");
}
} catch (Exception ex) {
// Log any exceptions that occur during the process and rethrow to handle them accordingly.
_logger.LogError(ex, "Exception while removing stale heartbeats from cache.");
Expand Down
5 changes: 0 additions & 5 deletions src/Services/LivenessCheck.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,6 @@ public Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, Canc
List<IMonitorableService> monitorableServices = _serviceProvider.GetServices<IHostedService>().Where(service => service is IMonitorableService).Cast<IMonitorableService>().ToList();


// // Add
// monitorableServices.Append(_resourceUtilizationMonitor);
// monitorableServices.Append(_heartbeatService);
// monitorableServices.Append(_pluginLoader);

// Check the core services separately
coreServiceHealthy = _messageReceiver.IsHealthy();
if (!coreServiceHealthy) {
Expand Down

0 comments on commit 3798beb

Please sign in to comment.