From b742e79a72a4fc543b7405a660b4481886afbea4 Mon Sep 17 00:00:00 2001
From: William Buchwalter
Date: Thu, 3 Aug 2017 23:59:41 -0400
Subject: [PATCH 1/3] manage cordon/uncordon properly

---
 autoscaler/agent_pool.py    | 17 +++++++++++------
 autoscaler/cluster.py       |  2 +-
 autoscaler/engine_scaler.py | 10 ++++++----
 autoscaler/kube.py          |  4 +++-
 autoscaler/scaler.py        |  2 ++
 5 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/autoscaler/agent_pool.py b/autoscaler/agent_pool.py
index 54fa153..4b3b430 100644
--- a/autoscaler/agent_pool.py
+++ b/autoscaler/agent_pool.py
@@ -4,6 +4,7 @@ from azure.mgmt.compute import ComputeManagementClient
 import logging
 import autoscaler.utils as utils
+import time
 
 from autoscaler.capacity import get_capacity_for_instance_type
 
@@ -32,16 +33,20 @@ def reclaim_unschedulable_nodes(self, new_desired_capacity):
         Try to get the number of schedulable nodes up if we don't have enough before scaling
         """
         desired_capacity = min(self.max_size, new_desired_capacity)
-        num_unschedulable = len(self.unschedulable_nodes)
-        num_schedulable = self.actual_capacity - num_unschedulable
-
-        if num_schedulable < desired_capacity:
+        # because of how acs-engine works (offset etc.), the desired capacity is the number of node existing in the pool
+        # (even unschedulable) + the number of additional node we need.
+
+        reclaimed = 0
+        if (self.actual_capacity + reclaimed) < desired_capacity:
             for node in self.unschedulable_nodes:
                 if node.uncordon():
-                    num_schedulable += 1
+                    # give some time to k8s to assign any pending pod to the newly uncordonned node
+                    time.sleep(10)
+                    reclaimed += 1
                 # Uncordon only what we need
-                if num_schedulable == desired_capacity:
+                if (self.actual_capacity + reclaimed) == desired_capacity:
                     break
+        return (self.actual_capacity + reclaimed)
 
     def has_node_with_index(self, index):
         for node in self.nodes:
diff --git a/autoscaler/cluster.py b/autoscaler/cluster.py
index 6dded9d..202000e 100644
--- a/autoscaler/cluster.py
+++ b/autoscaler/cluster.py
@@ -178,7 +178,7 @@ def get_pending_pods(self, pods, nodes):
         for pod in pods:
             fitting = None
             for node in nodes:
-                if node.can_fit(pod.resources):
+                if node.can_fit(pod.resources) and not node.unschedulable:
                     fitting = node
                     break
             if fitting is None:
diff --git a/autoscaler/engine_scaler.py b/autoscaler/engine_scaler.py
index e01ee79..14d9276 100644
--- a/autoscaler/engine_scaler.py
+++ b/autoscaler/engine_scaler.py
@@ -139,7 +139,11 @@ def scale_pools(self, new_pool_sizes):
 
             if not self.dry_run:
                 if new_size > pool.actual_capacity:
-                    pool.reclaim_unschedulable_nodes(new_size)
+                    available_size = pool.reclaim_unschedulable_nodes(new_size)
+                    if new_size == available_size:
+                        #we uncordonned enough nodes, no need to scale
+                        return
+
             else:
                 logger.info("[Dry run] Would have scaled pool '{}' to {} agent(s) (currently at {})".format(
                     pool.name, new_size, pool.actual_capacity))
@@ -222,7 +226,7 @@ def maintain(self, pods_to_schedule, running_or_pending_assigned_pods):
                 else:
                     logger.info(
                         '[Dry run] Would have drained and cordoned %s', node)
-            elif state == ClusterNodeState.IDLE_SCHEDULABLE:
+            elif state in (ClusterNodeState.IDLE_SCHEDULABLE, ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE):
                 if not self.dry_run:
                     node.cordon()
                 else:
@@ -238,8 +242,6 @@ def maintain(self, pods_to_schedule, running_or_pending_assigned_pods):
                     delete_queue.append({'node': node, 'pool': pool})
                 else:
                     logger.info('[Dry run] Would have scaled in %s', node)
-            elif state == ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE:
-                pass
             else:
                 raise Exception("Unhandled state: {}".format(state))
 
diff --git a/autoscaler/kube.py b/autoscaler/kube.py
index 5f7dfc4..bf12fd6 100644
--- a/autoscaler/kube.py
+++ b/autoscaler/kube.py
@@ -65,7 +65,9 @@ def is_in_drain_grace_period(self):
         determines whether the pod is in a grace period for draining
         this prevents us from draining pods that are too new
         """
-        return not self.start_time or (datetime.datetime.now(self.start_time.tzinfo) - self.start_time) < self._DRAIN_GRACE_PERIOD
+        #return not self.start_time or (datetime.datetime.now(self.start_time.tzinfo) - self.start_time) < self._DRAIN_GRACE_PERIOD
+        # disable pod grace_period, as this seems to cause more problem than benefits for most users
+        return True
 
     def is_drainable(self):
         return self.is_replicated() and not self.is_critical() and not self.is_in_drain_grace_period()
diff --git a/autoscaler/scaler.py b/autoscaler/scaler.py
index e03d0df..a576a4e 100644
--- a/autoscaler/scaler.py
+++ b/autoscaler/scaler.py
@@ -179,7 +179,9 @@ def fulfill_pending(self, pods):
 
         if num_unaccounted:
             logger.warn('Failed to scale sufficiently.')
            self.notifier.notify_failed_to_scale(selectors_hash, pods)
+        self.scale_pools(new_pool_sizes)
+
         if self.notifier:
             self.notifier.notify_scale(new_pool_sizes, pods, current_pool_sizes)
 
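The core of PATCH 1/3 is the new uncordon-before-scale flow in AgentPool.reclaim_unschedulable_nodes, combined with EngineScaler.scale_pools skipping the ARM deployment when uncordoning alone covers the request. The sketch below restates that flow outside the class so it can be read on its own; the Node stand-in, the settle_seconds parameter and the example values are illustrative and are not part of the project's code (the patch itself hard-codes a 10 second pause).

    # Standalone sketch of the uncordon-before-scale flow from PATCH 1/3.
    # `Node` is a stand-in for autoscaler.kube.KubeNode; only the behaviour the
    # loop needs is modelled, so treat this as an illustration, not the real API.
    import time

    class Node(object):
        def __init__(self, name, unschedulable=True):
            self.name = name
            self.unschedulable = unschedulable

        def uncordon(self):
            # The real KubeNode.uncordon() updates spec.unschedulable through the
            # Kubernetes API and returns False when that update fails.
            self.unschedulable = False
            return True

    def reclaim_unschedulable_nodes(max_size, actual_capacity, unschedulable_nodes,
                                    new_desired_capacity, settle_seconds=10):
        """Uncordon just enough nodes to reach the desired capacity.

        With acs-engine the desired capacity is the number of nodes already in
        the pool (cordoned or not) plus the extra nodes still needed, so each
        uncordoned node removes the need to create one new node.
        """
        desired_capacity = min(max_size, new_desired_capacity)
        reclaimed = 0
        if actual_capacity + reclaimed < desired_capacity:
            for node in unschedulable_nodes:
                if node.uncordon():
                    # Give the scheduler a moment to bind pending pods to the
                    # freshly uncordoned node before uncordoning another one.
                    time.sleep(settle_seconds)
                    reclaimed += 1
                # Uncordon only what we need.
                if actual_capacity + reclaimed == desired_capacity:
                    break
        return actual_capacity + reclaimed

    # Example: a pool of 3 nodes with 2 cordoned, and one extra node's worth of
    # capacity requested (desired = 3 existing + 1 extra = 4). Uncordoning a
    # single node covers the request, so scale_pools can return without deploying.
    cordoned = [Node("agent-1"), Node("agent-2")]
    print(reclaim_unschedulable_nodes(max_size=10, actual_capacity=3,
                                      unschedulable_nodes=cordoned,
                                      new_desired_capacity=4,
                                      settle_seconds=0))  # -> 4
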
From 2d0261958af11c9c0e754ab1dd9bff7f229a122a Mon Sep 17 00:00:00 2001
From: William Buchwalter
Date: Fri, 4 Aug 2017 00:15:14 -0400
Subject: [PATCH 2/3] add --util-threshold param

---
 README.md                   |  1 +
 autoscaler/cluster.py       |  8 +++++---
 autoscaler/engine_scaler.py |  5 ++---
 autoscaler/scaler.py        | 12 ++++++------
 main.py                     |  4 +++-
 5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 075ae5e..65df529 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,7 @@ $ python main.py [options]
 - --spare-agents: Number of agent per pool that should always stay up (default is 1)
 - --acs-deployment: The name of the deployment used to deploy the kubernetes cluster initially
 - --idle-threshold: Maximum duration (in seconds) an agent can stay idle before being deleted
+- --util-threshold: Utilization of a node in percent under which it is considered under utilized and should be cordoned
 - --over-provision: Number of extra agents to create when scaling up, default to 0.
 
 ## Windows Machine Pools
diff --git a/autoscaler/cluster.py b/autoscaler/cluster.py
index 202000e..0eddb7d 100644
--- a/autoscaler/cluster.py
+++ b/autoscaler/cluster.py
@@ -30,7 +30,7 @@ logger = logging.getLogger(__name__)
 
 class Cluster(object):
-    def __init__(self, kubeconfig, idle_threshold, spare_agents,
+    def __init__(self, kubeconfig, idle_threshold, util_threshold, spare_agents,
                  service_principal_app_id, service_principal_secret,
                  service_principal_tenant_id, kubeconfig_private_key,
                  client_private_key, ca_private_key,
                  instance_init_time, resource_group, notifier, ignore_pools,
@@ -54,6 +54,7 @@ def __init__(self, kubeconfig, idle_threshold, spare_agents,
         self.instance_init_time = instance_init_time
         self.spare_agents = spare_agents
         self.idle_threshold = idle_threshold
+        self.util_threshold = util_threshold
         self.over_provision = over_provision
         self.scale_up = scale_up
         self.maintainance = maintainance
@@ -113,8 +114,8 @@ def loop(self, debug):
             return self.loop_logic()
         except CloudError as e:
             logger.error(e)
-        except:
-            logger.warn("Unexpected error: {}".format(sys.exc_info()[0]))
+        except BaseException as e:
+            logger.warn("Unexpected error: {}".format(e))
         return False
 
     def create_kube_node(self, node):
@@ -142,6 +143,7 @@ def loop_logic(self):
             over_provision=self.over_provision,
             spare_count=self.spare_agents,
             idle_threshold=self.idle_threshold,
+            util_threshold=self.util_threshold,
             notifier=self.notifier)
 
         pods = list(map(KubePod, pykube.Pod.objects(self.api)))
diff --git a/autoscaler/engine_scaler.py b/autoscaler/engine_scaler.py
index 14d9276..c18c6bb 100644
--- a/autoscaler/engine_scaler.py
+++ b/autoscaler/engine_scaler.py
@@ -19,17 +19,16 @@
 logger = logging.getLogger(__name__)
 
-
 class EngineScaler(Scaler):
     def __init__(
             self, resource_group, nodes,
-            over_provision, spare_count, idle_threshold, dry_run,
+            over_provision, spare_count, idle_threshold, util_threshold, dry_run,
             deployments, arm_template, arm_parameters, ignore_pools, notifier):
         Scaler.__init__(
             self, resource_group, nodes, over_provision,
-            spare_count, idle_threshold, dry_run, deployments, notifier)
+            spare_count, idle_threshold, util_threshold, dry_run, deployments, notifier)
 
         self.arm_parameters = arm_parameters
         self.arm_template = arm_template
diff --git a/autoscaler/scaler.py b/autoscaler/scaler.py
index a576a4e..e87c14e 100644
--- a/autoscaler/scaler.py
+++ b/autoscaler/scaler.py
@@ -30,12 +30,9 @@ class ClusterNodeState(object):
 
 
 class Scaler(object):
+    # UTIL_THRESHOLD = 0.3
 
-    # the utilization threshold under which to consider a node
-    # under utilized and drainable
-    UTIL_THRESHOLD = 0.3
-
-    def __init__(self, resource_group, nodes, over_provision, spare_count, idle_threshold, dry_run, deployments, notifier):
+    def __init__(self, resource_group, nodes, over_provision, spare_count, idle_threshold, util_threshold, dry_run, deployments, notifier):
         self.resource_group_name = resource_group
         self.over_provision = over_provision
         self.spare_count = spare_count
@@ -43,6 +40,9 @@ def __init__(self, resource_group, nodes, over_provision, spare_count, idle_thre
         self.dry_run = dry_run
         self.deployments = deployments
         self.notifier = notifier
+        # the utilization threshold under which to consider a node
+        # under utilized
+        self.util_threshold = util_threshold / 100
 
         # ACS support up to 100 agents today
         # TODO: how to handle case where cluster has 0 node? How to get unit
@@ -83,7 +83,7 @@ def get_node_state(self, node, node_pods, pods_to_schedule):
                                p.is_drainable() or 'kube-proxy' in p.name)]
         utilization = sum((p.resources for p in busy_list), KubeResource())
-        under_utilized = (self.UTIL_THRESHOLD *
+        under_utilized = (self.util_threshold *
                           node.capacity - utilization).possible
         drainable = not undrainable_list
 
diff --git a/main.py b/main.py
index d0f8cd3..d2aa9ad 100644
--- a/main.py
+++ b/main.py
@@ -26,6 +26,7 @@
 #How many agents should we keep even if the cluster is not utilized? The autoscaler will currenty break if --spare-agents == 0
 @click.option("--spare-agents", default=1, help='number of agent per pool that should always stay up')
 @click.option("--idle-threshold", default=1800, help='time in seconds an agent can stay idle')
+@click.option("--util-threshold", default=30, help='Utilization of a node in percent under which it is considered under utilized and should be cordoned')
 @click.option("--service-principal-app-id", default=None, envvar='AZURE_SP_APP_ID')
 @click.option("--service-principal-secret", default=None, envvar='AZURE_SP_SECRET')
 @click.option("--service-principal-tenant-id", default=None, envvar='AZURE_SP_TENANT_ID')
@@ -50,7 +51,7 @@ def main(resource_group, acs_deployment, sleep, kubeconfig,
          service_principal_app_id, service_principal_secret,
          kubeconfig_private_key, client_private_key, ca_private_key,
-         service_principal_tenant_id, spare_agents, idle_threshold,
+         service_principal_tenant_id, spare_agents, idle_threshold, util_threshold,
          no_scale, over_provision, no_maintenance, ignore_pools,
          slack_hook, dry_run, verbose, debug):
 
     logger_handler = logging.StreamHandler(sys.stderr)
@@ -81,6 +82,7 @@ def main(resource_group, acs_deployment, sleep, kubeconfig,
         instance_init_time=instance_init_time,
         spare_agents=spare_agents,
         idle_threshold=idle_threshold,
+        util_threshold=util_threshold,
         resource_group=resource_group,
         acs_deployment=acs_deployment,
         service_principal_app_id=service_principal_app_id,
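PATCH 2/3 threads the new --util-threshold percentage from the click option in main.py down to Scaler, which stores it as util_threshold / 100 and uses it in get_node_state to decide whether a node is under utilized. The snippet below is a simplified, self-contained version of that check: plain dicts stand in for KubeResource (whose subtraction and .possible test the real code relies on), and the function name and example numbers are illustrative.

    # Simplified illustration of the --util-threshold check from PATCH 2/3.
    # Plain dicts stand in for autoscaler.kube.KubeResource, so this is a sketch
    # of the idea rather than the project's implementation.

    def is_under_utilized(capacity, utilization, util_threshold_percent=30):
        """True when usage sits below the threshold in every resource dimension.

        The Scaler constructor divides the CLI percentage by 100; get_node_state
        then checks whether (threshold * capacity - utilization) stays
        non-negative for every tracked resource.
        """
        threshold = util_threshold_percent / 100.0
        return all(threshold * capacity[r] - utilization.get(r, 0) >= 0
                   for r in capacity)

    # A node with 2 CPUs and 8 GiB of memory running about 0.4 CPU / 1 GiB of
    # pods is under utilized at the default 30% threshold, so maintain() (see
    # PATCH 1/3) now cordons it rather than leaving it untouched.
    print(is_under_utilized({'cpu': 2.0, 'memory': 8.0}, {'cpu': 0.4, 'memory': 1.0}))  # True
    print(is_under_utilized({'cpu': 2.0, 'memory': 8.0}, {'cpu': 1.5, 'memory': 1.0}))  # False
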
From 4a18f5abc431c6a18c6f0b14003f169fb02ef346 Mon Sep 17 00:00:00 2001
From: William Buchwalter
Date: Fri, 4 Aug 2017 00:47:58 -0400
Subject: [PATCH 3/3] do not override user cordon

---
 autoscaler/kube.py   | 1 +
 test/test_cluster.py | 1 +
 test/utils.py        | 1 +
 3 files changed, 3 insertions(+)

diff --git a/autoscaler/kube.py b/autoscaler/kube.py
index bf12fd6..3a627ae 100644
--- a/autoscaler/kube.py
+++ b/autoscaler/kube.py
@@ -140,6 +140,7 @@ def uncordon(self):
         try:
             self.original.reload()
             self.original.obj['spec']['unschedulable'] = False
+            self.original.obj['metadata']['labels'][_CORDON_LABEL] = 'false'
             self.original.update()
             logger.info("uncordoned %s", self)
             return True
diff --git a/test/test_cluster.py b/test/test_cluster.py
index b1aa2bc..afbc650 100644
--- a/test/test_cluster.py
+++ b/test/test_cluster.py
@@ -39,6 +39,7 @@ def setUp(self):
         self.cluster = Cluster(
             kubeconfig='~/.kube/config',
             idle_threshold=60,
+            util_threshold=30,
             spare_agents=1,
             instance_init_time=60,
             resource_group='my-rg',
diff --git a/test/utils.py b/test/utils.py
index 4b35800..5d77097 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -17,4 +17,5 @@ def create_scaler(nodes):
         arm_template=template,
         ignore_pools='',
         idle_threshold=0,
+        util_threshold=30,
         notifier='')
\ No newline at end of file
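PATCH 3/3 makes uncordon() set the node's cordon label back to 'false', which appears to be how the autoscaler tells its own cordons apart from ones an operator applied by hand. The sketch below shows that idea in isolation; node is assumed to behave like the pykube object wrapped by autoscaler.kube.KubeNode (reload(), update() and an .obj dict), CORDON_LABEL is a placeholder for the _CORDON_LABEL constant whose value is not shown in this diff, and the cordon()/uncordon() helpers are illustrative rather than the project's actual methods.

    # Sketch of the "do not override user cordon" idea from PATCH 3/3.
    # CORDON_LABEL is a placeholder name; the real constant is _CORDON_LABEL in
    # autoscaler/kube.py and its value does not appear in this diff.
    import logging

    logger = logging.getLogger(__name__)
    CORDON_LABEL = 'autoscaler/cordoned'  # placeholder, not the project's label

    def cordon(node):
        """Cordon a node and record that the autoscaler (not a user) did it."""
        node.reload()
        node.obj['spec']['unschedulable'] = True
        node.obj['metadata']['labels'][CORDON_LABEL] = 'true'
        node.update()
        return True

    def uncordon(node):
        """Uncordon only nodes carrying the autoscaler's own cordon marker.

        A node cordoned manually by an operator never gets the 'true' marker,
        so it is left alone instead of being put back into service.
        """
        node.reload()
        labels = node.obj['metadata'].get('labels', {})
        if labels.get(CORDON_LABEL) != 'true':
            logger.info("skipping %s: not cordoned by the autoscaler", node)
            return False
        node.obj['spec']['unschedulable'] = False
        node.obj['metadata']['labels'][CORDON_LABEL] = 'false'
        node.update()
        logger.info("uncordoned %s", node)
        return True

The test changes in this patch simply pass the new util_threshold argument through to Cluster and create_scaler so the existing suites keep constructing valid objects.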