Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add scale_to_zero_timeout parameter to HFApi.create/update_inference_endpoint #2463

Merged
merged 3 commits into from
Aug 20, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7390,6 +7390,7 @@ def create_inference_endpoint(
account_id: Optional[str] = None,
min_replica: int = 0,
max_replica: int = 1,
scale_to_zero_timeout: int = 15,
revision: Optional[str] = None,
task: Optional[str] = None,
custom_image: Optional[Dict] = None,
Expand Down Expand Up @@ -7422,6 +7423,8 @@ def create_inference_endpoint(
The minimum number of replicas (instances) to keep running for the Inference Endpoint. Defaults to 0.
max_replica (`int`, *optional*):
The maximum number of replicas (instances) to scale to for the Inference Endpoint. Defaults to 1.
scale_to_zero_timeout (`int`, *optional*):
The duration in minutes before an inactive endpoint is scaled to zero. Defaults to 15.
revision (`str`, *optional*):
The specific model revision to deploy on the Inference Endpoint (e.g. `"6c0e6080953db56375760c0471a8c5f2929baf11"`).
task (`str`, *optional*):
Expand Down Expand Up @@ -7507,6 +7510,7 @@ def create_inference_endpoint(
"scaling": {
"maxReplica": max_replica,
"minReplica": min_replica,
"scaleToZeroTimeout": scale_to_zero_timeout,
},
},
"model": {
Expand Down Expand Up @@ -7590,6 +7594,7 @@ def update_inference_endpoint(
instance_type: Optional[str] = None,
min_replica: Optional[int] = None,
max_replica: Optional[int] = None,
scale_to_zero_timeout: Optional[int] = None,
# Model update
repository: Optional[str] = None,
framework: Optional[str] = None,
Expand Down Expand Up @@ -7621,6 +7626,8 @@ def update_inference_endpoint(
The minimum number of replicas (instances) to keep running for the Inference Endpoint.
max_replica (`int`, *optional*):
The maximum number of replicas (instances) to scale to for the Inference Endpoint.
scale_to_zero_timeout (`int`, *optional*):
The duration in minutes before an inactive endpoint is scaled to zero.

repository (`str`, *optional*):
The name of the model repository associated with the Inference Endpoint (e.g. `"gpt2"`).
Expand Down Expand Up @@ -7648,14 +7655,18 @@ def update_inference_endpoint(
namespace = namespace or self._get_namespace(token=token)

payload: Dict = {}
if any(value is not None for value in (accelerator, instance_size, instance_type, min_replica, max_replica)):
if any(
value is not None
for value in (accelerator, instance_size, instance_type, min_replica, max_replica, scale_to_zero_timeout)
):
payload["compute"] = {
"accelerator": accelerator,
"instanceSize": instance_size,
"instanceType": instance_type,
"scaling": {
"maxReplica": max_replica,
"minReplica": min_replica,
"scaleToZeroTimeout": scale_to_zero_timeout,
},
}
if any(value is not None for value in (repository, framework, revision, task, custom_image)):
Expand Down
Loading