Skip to content

Commit 6d44867

Browse files
committed
node: devicemgr: capture latest API changes and upgrade strategy
- Updates to capture latest device API changes. - Capture upgrade and version skew strategy to capture the correct state since device plugin graduation to Beta in Kubernetes v1.10. Signed-off-by: Swati Sehgal <swsehgal@redhat.com>
1 parent 8c507ba commit 6d44867

File tree

1 file changed

+152
-73
lines changed

1 file changed

+152
-73
lines changed

keps/sig-node/3573-device-plugin/README.md

+152-73
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,11 @@ Kubernetes provides to vendors a mechanism called device plugins to:
165165

166166
```go
167167
service DevicePlugin {
168-
// returns a stream of []Device
168+
rpc GetDevicePluginOptions(Empty) returns (DevicePluginOptions) {}
169169
rpc ListAndWatch(Empty) returns (stream ListAndWatchResponse) {}
170170
rpc Allocate(AllocateRequest) returns (AllocateResponse) {}
171+
rpc GetPreferredAllocation(PreferredAllocationRequest) returns (PreferredAllocationResponse) {}
172+
rpc PreStartContainer(PreStartContainerRequest) returns (PreStartContainerResponse) {}
171173
}
172174
```
173175

@@ -306,27 +308,129 @@ service Registration {
306308
rpc Register(RegisterRequest) returns (Empty) {}
307309
}
308310

311+
message DevicePluginOptions {
312+
// Indicates if PreStartContainer call is required before each container start
313+
bool pre_start_required = 1;
314+
// Indicates if GetPreferredAllocation is implemented and available for calling
315+
bool get_preferred_allocation_available = 2;
316+
}
317+
318+
message RegisterRequest {
319+
// Version of the API the Device Plugin was built against
320+
string version = 1;
321+
// Name of the unix socket the device plugin is listening on
322+
// PATH = path.Join(DevicePluginPath, endpoint)
323+
string endpoint = 2;
324+
// Schedulable resource name. As of now it's expected to be a DNS Label
325+
string resource_name = 3;
326+
// Options to be communicated with Device Manager
327+
DevicePluginOptions options = 4;
328+
}
329+
330+
message Empty {
331+
}
332+
309333
// DevicePlugin is the service advertised by Device Plugins
310334
service DevicePlugin {
335+
// GetDevicePluginOptions returns options to be communicated with Device
336+
// Manager
337+
rpc GetDevicePluginOptions(Empty) returns (DevicePluginOptions) {}
338+
311339
// ListAndWatch returns a stream of List of Devices
312340
// Whenever a Device state changes or a Device disappears, ListAndWatch
313341
// returns the new list
314342
rpc ListAndWatch(Empty) returns (stream ListAndWatchResponse) {}
315343

344+
// GetPreferredAllocation returns a preferred set of devices to allocate
345+
// from a list of available ones. The resulting preferred allocation is not
346+
// guaranteed to be the allocation ultimately performed by the
347+
// devicemanager. It is only designed to help the devicemanager make a more
348+
// informed allocation decision when possible.
349+
rpc GetPreferredAllocation(PreferredAllocationRequest) returns (PreferredAllocationResponse) {}
350+
316351
// Allocate is called during container creation so that the Device
317352
// Plugin can run device specific operations and instruct Kubelet
318353
// of the steps to make the Device available in the container
319354
rpc Allocate(AllocateRequest) returns (AllocateResponse) {}
355+
356+
// PreStartContainer is called, if indicated by Device Plugin during registration phase,
357+
// before each container start. Device plugin can run device specific operations
358+
// such as resetting the device before making devices available to the container
359+
rpc PreStartContainer(PreStartContainerRequest) returns (PreStartContainerResponse) {}
320360
}
321361

322-
message RegisterRequest {
323-
// Version of the API the Device Plugin was built against
324-
string version = 1;
325-
// Name of the unix socket the device plugin is listening on
326-
// PATH = path.Join(DevicePluginPath, endpoint)
327-
string endpoint = 2;
328-
// Schedulable resource name
329-
string resource_name = 3;
362+
// ListAndWatch returns a stream of List of Devices
363+
// Whenever a Device state changes or a Device disappears, ListAndWatch
364+
// returns the new list
365+
message ListAndWatchResponse {
366+
repeated Device devices = 1;
367+
}
368+
369+
message TopologyInfo {
370+
repeated NUMANode nodes = 1;
371+
}
372+
373+
message NUMANode {
374+
int64 ID = 1;
375+
}
376+
377+
/* E.g:
378+
* struct Device {
379+
* ID: "GPU-fef8089b-4820-abfc-e83e-94318197576e",
380+
* Health: "Healthy",
381+
* Topology:
382+
* Node:
383+
* ID: 1
384+
*} */
385+
message Device {
386+
// A unique ID assigned by the device plugin used
387+
// to identify devices during the communication
388+
// Max length of this field is 63 characters
389+
string ID = 1;
390+
// Health of the device, can be healthy or unhealthy, see constants.go
391+
string health = 2;
392+
// Topology for device
393+
TopologyInfo topology = 3;
394+
}
395+
396+
// - PreStartContainer is expected to be called before each container start if indicated by plugin during registration phase.
397+
// - PreStartContainer allows kubelet to pass reinitialized devices to containers.
398+
// - PreStartContainer allows Device Plugin to run device specific operations on
399+
// the Devices requested
400+
message PreStartContainerRequest {
401+
repeated string devices_ids = 1 [(gogoproto.customname) = "DevicesIDs"];
402+
}
403+
404+
// PreStartContainerResponse will be sent by the plugin in response to PreStartContainerRequest
405+
message PreStartContainerResponse {
406+
}
407+
408+
// PreferredAllocationRequest is passed via a call to GetPreferredAllocation()
409+
// at pod admission time. The device plugin should take the list of
410+
// `available_deviceIDs` and calculate a preferred allocation of size
411+
// 'allocation_size' from them, making sure to include the set of devices
412+
// listed in 'must_include_deviceIDs'.
413+
message PreferredAllocationRequest {
414+
repeated ContainerPreferredAllocationRequest container_requests = 1;
415+
}
416+
417+
message ContainerPreferredAllocationRequest {
418+
// List of available deviceIDs from which to choose a preferred allocation
419+
repeated string available_deviceIDs = 1;
420+
// List of deviceIDs that must be included in the preferred allocation
421+
repeated string must_include_deviceIDs = 2;
422+
// Number of devices to include in the preferred allocation
423+
int32 allocation_size = 3;
424+
}
425+
426+
// PreferredAllocationResponse returns a preferred allocation,
427+
// resulting from a PreferredAllocationRequest.
428+
message PreferredAllocationResponse {
429+
repeated ContainerPreferredAllocationResponse container_responses = 1;
430+
}
431+
432+
message ContainerPreferredAllocationResponse {
433+
repeated string deviceIDs = 1;
330434
}
331435

332436
// - Allocate is expected to be called during pod creation since allocation
@@ -336,71 +440,61 @@ message RegisterRequest {
336440
// - Allocate allows Device Plugin to run device specific operations on
337441
// the Devices requested
338442
message AllocateRequest {
339-
repeated string devicesIDs = 1;
443+
repeated ContainerAllocateRequest container_requests = 1;
340444
}
341445

446+
message ContainerAllocateRequest {
447+
repeated string devices_ids = 1 [(gogoproto.customname) = "DevicesIDs"];
448+
}
449+
450+
// AllocateResponse includes the artifacts that need to be injected into
451+
// a container for accessing 'deviceIDs' that were mentioned as part of
452+
// 'AllocateRequest'.
342453
// Failure Handling:
343454
// if Kubelet sends an allocation request for dev1 and dev2.
344455
// Allocation on dev1 succeeds but allocation on dev2 fails.
345456
// The Device plugin should send a ListAndWatch update and fail the
346457
// Allocation request
347458
message AllocateResponse {
348-
repeated DeviceRuntimeSpec spec = 1;
459+
repeated ContainerAllocateResponse container_responses = 1;
349460
}
350461

351-
// ListAndWatch returns a stream of List of Devices
352-
// Whenever a Device state change or a Device disappears, ListAndWatch
353-
// returns the new list
354-
message ListAndWatchResponse {
355-
repeated Device devices = 1;
356-
}
357-
358-
// The list to be added to the CRI spec
359-
message DeviceRuntimeSpec {
360-
string ID = 1;
361-
362-
// List of environment variable to set in the container.
363-
map<string, string> envs = 2;
462+
message ContainerAllocateResponse {
463+
// List of environment variables to be set in the container to access one or more devices.
464+
map<string, string> envs = 1;
364465
// Mounts for the container.
365-
repeated Mount mounts = 3;
366-
// Devices for the container
367-
repeated DeviceSpec devices = 4;
368-
}
369-
370-
// DeviceSpec specifies a host device to mount into a container.
371-
message DeviceSpec {
372-
// Path of the device within the container.
373-
string container_path = 1;
374-
// Path of the device on the host.
375-
string host_path = 2;
376-
// Cgroups permissions of the device, candidates are one or more of
377-
// * r - allows container to read from the specified device.
378-
// * w - allows container to write to the specified device.
379-
// * m - allows container to create device files that do not yet exist.
380-
string permissions = 3;
466+
repeated Mount mounts = 2;
467+
// Devices for the container.
468+
repeated DeviceSpec devices = 3;
469+
// Container annotations to pass to the container runtime
470+
map<string, string> annotations = 4;
381471
}
382472

383473
// Mount specifies a host volume to mount into a container.
384474
// where device library or tools are installed on host and container
385475
message Mount {
386-
// Path of the mount on the host.
387-
string host_path = 1;
388476
// Path of the mount within the container.
389-
string mount_path = 2;
477+
string container_path = 1;
478+
// Path of the mount on the host.
479+
string host_path = 2;
390480
// If set, the mount is read-only.
391481
bool read_only = 3;
392482
}
393483

394-
// E.g:
395-
// struct Device {
396-
// ID: "GPU-fef8089b-4820-abfc-e83e-94318197576e",
397-
// State: "Healthy",
398-
//}
399-
message Device {
400-
string ID = 2;
401-
string health = 3;
484+
// DeviceSpec specifies a host device to mount into a container.
485+
message DeviceSpec {
486+
// Path of the device within the container.
487+
string container_path = 1;
488+
// Path of the device on the host.
489+
string host_path = 2;
490+
// Cgroups permissions of the device, candidates are one or more of
491+
// * r - allows container to read from the specified device.
492+
// * w - allows container to write to the specified device.
493+
// * m - allows container to create device files that do not yet exist.
494+
string permissions = 3;
402495
}
403496
```
497+
404498
### HealthCheck and Failure Recovery
405499

406500
We want Kubelet as well as the Device Plugins to recover from failures
@@ -575,17 +669,8 @@ protocol and are able to recover from a Kubelet crash.
575669
Then, as long as the Device Plugin API does not change, upgrading Kubelet can be done
576670
seamlessly through a Kubelet restart.
577671

578-
*Currently:*
579-
As mentioned in the Versioning section, we currently expect the Device Plugin's
580-
API version to match exactly the Kubelet's Device Plugin API version.
581-
Therefore if the Device Plugin API version change then you will have to change
582-
the Device Plugin too.
583-
584-
*Future:*
585-
When the Device Plugin API becomes a stable feature, versioning should be
586-
backward compatible and even if Kubelet has a different Device Plugin API,
587-
588-
it should not require a Device Plugin upgrade.
672+
Upgrading Kubelet can be done seamlessly through a Kubelet restart and does not
673+
require changes to workflow as the device plugin API is stable.
589674

590675
Refer to the versioning section for versioning scheme compatibility.
591676

@@ -603,16 +688,10 @@ the Device Plugins.
603688

604689
### Version Skew Strategy
605690

606-
Currently we require exact version match between Kubelet and Device Plugin.
607-
API version is expected to be increased only upon incompatible API changes.
608-
609-
Follow protobuf guidelines on versioning:
610-
* Do not change ordering
611-
* Do not remove fields or change types
612-
* Add optional fields
613-
* Introducing new fields with proper default values
614-
* Freeze the package name to `apis/device-plugin/v1alpha1`
615-
* Have kubelet and the Device Plugin negotiate versions if we do break the API
691+
Prior to v1.10, the versioning scheme required the Device Plugin's API version to
692+
match exactly the Kubelet's version. With the graduation of this feature to Beta
693+
and the move of the device plugin API to a stable API (version v1beta1), backward
694+
compatibility is supported.
616695

617696
## Production Readiness Review Questionnaire
618697

@@ -809,4 +888,4 @@ In Kubernetes v1.25, [Dynamic Resource Allocation](https://github.com/kubernetes
809888

810889
## Infrastructure Needed (Optional)
811890

812-
Not Applicable.
891+
Not Applicable.

0 commit comments

Comments
 (0)