From 37a74242347b7faef95ddb33fb6b142b5b7f516f Mon Sep 17 00:00:00 2001 From: Stephen Salinas Date: Thu, 5 Sep 2019 08:43:41 -0400 Subject: [PATCH] Retry more TASK_LOST cases on deploy --- .../scheduler/SingularityDeployChecker.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityDeployChecker.java b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityDeployChecker.java index 8a81f5f37d..a308293385 100644 --- a/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityDeployChecker.java +++ b/SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityDeployChecker.java @@ -24,6 +24,7 @@ import com.hubspot.baragon.models.BaragonRequestState; import com.hubspot.mesos.JavaUtils; import com.hubspot.singularity.DeployState; +import com.hubspot.singularity.ExtendedTaskState; import com.hubspot.singularity.LoadBalancerRequestType; import com.hubspot.singularity.LoadBalancerRequestType.LoadBalancerRequestId; import com.hubspot.singularity.RequestState; @@ -47,6 +48,7 @@ import com.hubspot.singularity.SingularityRequestWithState; import com.hubspot.singularity.SingularityTask; import com.hubspot.singularity.SingularityTaskCleanup; +import com.hubspot.singularity.SingularityTaskHistoryUpdate; import com.hubspot.singularity.SingularityTaskId; import com.hubspot.singularity.SingularityTaskShellCommandRequestId; import com.hubspot.singularity.SingularityUpdatePendingDeployRequest; @@ -598,7 +600,17 @@ private SingularityDeployResult getDeployResult(final SingularityRequest request private boolean canRetryTasks(Optional deploy, Collection inactiveDeployMatchingTasks) { int maxRetries = deploy.get().getMaxTaskRetries().orElse(configuration.getDefaultDeployMaxTaskRetries()); - return deploy.isPresent() && maxRetries > 0 && inactiveDeployMatchingTasks.size() <= maxRetries; + long matchingInactiveTasks = inactiveDeployMatchingTasks.stream() + .filter((t) -> { + // All TASK_LOSTs that are not resource limit related should be able to be retried + for (SingularityTaskHistoryUpdate historyUpdate : taskManager.getTaskHistoryUpdates(t)) { + if (historyUpdate.getTaskState() == ExtendedTaskState.TASK_LOST && !historyUpdate.getStatusReason().orElse("").startsWith("REASON_CONTAINER")) { + return false; + } + } + return true; + }).count(); + return maxRetries > 0 && matchingInactiveTasks <= maxRetries; } private Set getNewInactiveDeployTasks(SingularityPendingDeploy pendingDeploy, Collection inactiveDeployMatchingTasks) {