From 9822e28e7ef4a86f4146ab0bc7184ec2b5865f39 Mon Sep 17 00:00:00 2001 From: Yushi Homma Date: Mon, 19 Aug 2024 12:12:59 -0700 Subject: [PATCH 1/3] Add scale_to_zero_timeout parameter to HFApi.create/update_inference_endpoint --- src/huggingface_hub/hf_api.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index f1b95f85a0..260e7767be 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -7390,6 +7390,7 @@ def create_inference_endpoint( account_id: Optional[str] = None, min_replica: int = 0, max_replica: int = 1, + scale_to_zero_timeout: int = 15, revision: Optional[str] = None, task: Optional[str] = None, custom_image: Optional[Dict] = None, @@ -7422,6 +7423,8 @@ def create_inference_endpoint( The minimum number of replicas (instances) to keep running for the Inference Endpoint. Defaults to 0. max_replica (`int`, *optional*): The maximum number of replicas (instances) to scale to for the Inference Endpoint. Defaults to 1. + scale_to_zero_timeout (`int`, *optional*): + The duration in minutes before an inactive endpoint is scaled to zero. Defaults to 15. revision (`str`, *optional*): The specific model revision to deploy on the Inference Endpoint (e.g. `"6c0e6080953db56375760c0471a8c5f2929baf11"`). task (`str`, *optional*): @@ -7507,6 +7510,7 @@ def create_inference_endpoint( "scaling": { "maxReplica": max_replica, "minReplica": min_replica, + "scaleToZeroTimeout": scale_to_zero_timeout }, }, "model": { @@ -7590,6 +7594,7 @@ def update_inference_endpoint( instance_type: Optional[str] = None, min_replica: Optional[int] = None, max_replica: Optional[int] = None, + scale_to_zero_timeout: Optional[int] = None, # Model update repository: Optional[str] = None, framework: Optional[str] = None, @@ -7621,6 +7626,8 @@ def update_inference_endpoint( The minimum number of replicas (instances) to keep running for the Inference Endpoint. max_replica (`int`, *optional*): The maximum number of replicas (instances) to scale to for the Inference Endpoint. + scale_to_zero_timeout (`int`, *optional*): + The duration in minutes before an inactive endpoint is scaled to zero. repository (`str`, *optional*): The name of the model repository associated with the Inference Endpoint (e.g. `"gpt2"`). @@ -7648,7 +7655,7 @@ def update_inference_endpoint( namespace = namespace or self._get_namespace(token=token) payload: Dict = {} - if any(value is not None for value in (accelerator, instance_size, instance_type, min_replica, max_replica)): + if any(value is not None for value in (accelerator, instance_size, instance_type, min_replica, max_replica, scale_to_zero_timeout)): payload["compute"] = { "accelerator": accelerator, "instanceSize": instance_size, @@ -7656,6 +7663,7 @@ def update_inference_endpoint( "scaling": { "maxReplica": max_replica, "minReplica": min_replica, + "scaleToZeroTimeout": scale_to_zero_timeout }, } if any(value is not None for value in (repository, framework, revision, task, custom_image)): From 6629926697e2cfbe794d40f092571f66de3e9c9f Mon Sep 17 00:00:00 2001 From: Yushi Homma Date: Mon, 19 Aug 2024 12:22:14 -0700 Subject: [PATCH 2/3] add ending commas for style --- src/huggingface_hub/hf_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 260e7767be..a0ca584773 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -7510,7 +7510,7 @@ def create_inference_endpoint( "scaling": { "maxReplica": max_replica, "minReplica": min_replica, - "scaleToZeroTimeout": scale_to_zero_timeout + "scaleToZeroTimeout": scale_to_zero_timeout, }, }, "model": { @@ -7663,7 +7663,7 @@ def update_inference_endpoint( "scaling": { "maxReplica": max_replica, "minReplica": min_replica, - "scaleToZeroTimeout": scale_to_zero_timeout + "scaleToZeroTimeout": scale_to_zero_timeout, }, } if any(value is not None for value in (repository, framework, revision, task, custom_image)): From 2d9b152bc8cc3099f49cef2a7e12ff1348bfc085 Mon Sep 17 00:00:00 2001 From: Yushi Homma Date: Mon, 19 Aug 2024 12:37:37 -0700 Subject: [PATCH 3/3] add make style changes --- src/huggingface_hub/hf_api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index a0ca584773..6997bc3023 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -7655,7 +7655,10 @@ def update_inference_endpoint( namespace = namespace or self._get_namespace(token=token) payload: Dict = {} - if any(value is not None for value in (accelerator, instance_size, instance_type, min_replica, max_replica, scale_to_zero_timeout)): + if any( + value is not None + for value in (accelerator, instance_size, instance_type, min_replica, max_replica, scale_to_zero_timeout) + ): payload["compute"] = { "accelerator": accelerator, "instanceSize": instance_size,