From 9822e28e7ef4a86f4146ab0bc7184ec2b5865f39 Mon Sep 17 00:00:00 2001
From: Yushi Homma <hommayushi3@gmail.com>
Date: Mon, 19 Aug 2024 12:12:59 -0700
Subject: [PATCH 1/3] Add scale_to_zero_timeout parameter to
 HFApi.create/update_inference_endpoint

---
 src/huggingface_hub/hf_api.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py
index f1b95f85a0..260e7767be 100644
--- a/src/huggingface_hub/hf_api.py
+++ b/src/huggingface_hub/hf_api.py
@@ -7390,6 +7390,7 @@ def create_inference_endpoint(
         account_id: Optional[str] = None,
         min_replica: int = 0,
         max_replica: int = 1,
+        scale_to_zero_timeout: int = 15,
         revision: Optional[str] = None,
         task: Optional[str] = None,
         custom_image: Optional[Dict] = None,
@@ -7422,6 +7423,8 @@ def create_inference_endpoint(
                 The minimum number of replicas (instances) to keep running for the Inference Endpoint. Defaults to 0.
             max_replica (`int`, *optional*):
                 The maximum number of replicas (instances) to scale to for the Inference Endpoint. Defaults to 1.
+            scale_to_zero_timeout (`int`, *optional*):
+                The duration in minutes before an inactive endpoint is scaled to zero. Defaults to 15.
             revision (`str`, *optional*):
                 The specific model revision to deploy on the Inference Endpoint (e.g. `"6c0e6080953db56375760c0471a8c5f2929baf11"`).
             task (`str`, *optional*):
@@ -7507,6 +7510,7 @@ def create_inference_endpoint(
                 "scaling": {
                     "maxReplica": max_replica,
                     "minReplica": min_replica,
+                    "scaleToZeroTimeout": scale_to_zero_timeout
                 },
             },
             "model": {
@@ -7590,6 +7594,7 @@ def update_inference_endpoint(
         instance_type: Optional[str] = None,
         min_replica: Optional[int] = None,
         max_replica: Optional[int] = None,
+        scale_to_zero_timeout: Optional[int] = None,
         # Model update
         repository: Optional[str] = None,
         framework: Optional[str] = None,
@@ -7621,6 +7626,8 @@ def update_inference_endpoint(
                 The minimum number of replicas (instances) to keep running for the Inference Endpoint.
             max_replica (`int`, *optional*):
                 The maximum number of replicas (instances) to scale to for the Inference Endpoint.
+            scale_to_zero_timeout (`int`, *optional*):
+                The duration in minutes before an inactive endpoint is scaled to zero.
 
             repository (`str`, *optional*):
                 The name of the model repository associated with the Inference Endpoint (e.g. `"gpt2"`).
@@ -7648,7 +7655,7 @@ def update_inference_endpoint(
         namespace = namespace or self._get_namespace(token=token)
 
         payload: Dict = {}
-        if any(value is not None for value in (accelerator, instance_size, instance_type, min_replica, max_replica)):
+        if any(value is not None for value in (accelerator, instance_size, instance_type, min_replica, max_replica, scale_to_zero_timeout)):
             payload["compute"] = {
                 "accelerator": accelerator,
                 "instanceSize": instance_size,
@@ -7656,6 +7663,7 @@ def update_inference_endpoint(
                 "scaling": {
                     "maxReplica": max_replica,
                     "minReplica": min_replica,
+                    "scaleToZeroTimeout": scale_to_zero_timeout
                 },
             }
         if any(value is not None for value in (repository, framework, revision, task, custom_image)):

From 6629926697e2cfbe794d40f092571f66de3e9c9f Mon Sep 17 00:00:00 2001
From: Yushi Homma <hommayushi3@gmail.com>
Date: Mon, 19 Aug 2024 12:22:14 -0700
Subject: [PATCH 2/3] Add trailing commas for style

---
 src/huggingface_hub/hf_api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py
index 260e7767be..a0ca584773 100644
--- a/src/huggingface_hub/hf_api.py
+++ b/src/huggingface_hub/hf_api.py
@@ -7510,7 +7510,7 @@ def create_inference_endpoint(
                 "scaling": {
                     "maxReplica": max_replica,
                     "minReplica": min_replica,
-                    "scaleToZeroTimeout": scale_to_zero_timeout
+                    "scaleToZeroTimeout": scale_to_zero_timeout,
                 },
             },
             "model": {
@@ -7663,7 +7663,7 @@ def update_inference_endpoint(
                 "scaling": {
                     "maxReplica": max_replica,
                     "minReplica": min_replica,
-                    "scaleToZeroTimeout": scale_to_zero_timeout
+                    "scaleToZeroTimeout": scale_to_zero_timeout,
                 },
             }
         if any(value is not None for value in (repository, framework, revision, task, custom_image)):

From 2d9b152bc8cc3099f49cef2a7e12ff1348bfc085 Mon Sep 17 00:00:00 2001
From: Yushi Homma <hommayushi3@gmail.com>
Date: Mon, 19 Aug 2024 12:37:37 -0700
Subject: [PATCH 3/3] Apply `make style` formatting changes

---
 src/huggingface_hub/hf_api.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py
index a0ca584773..6997bc3023 100644
--- a/src/huggingface_hub/hf_api.py
+++ b/src/huggingface_hub/hf_api.py
@@ -7655,7 +7655,10 @@ def update_inference_endpoint(
         namespace = namespace or self._get_namespace(token=token)
 
         payload: Dict = {}
-        if any(value is not None for value in (accelerator, instance_size, instance_type, min_replica, max_replica, scale_to_zero_timeout)):
+        if any(
+            value is not None
+            for value in (accelerator, instance_size, instance_type, min_replica, max_replica, scale_to_zero_timeout)
+        ):
             payload["compute"] = {
                 "accelerator": accelerator,
                 "instanceSize": instance_size,