-
Notifications
You must be signed in to change notification settings - Fork 180
/
Copy pathvalues.yaml
275 lines (237 loc) · 10.8 KB
/
values.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
image:
repository: nvcr.io/nvidia/k8s/dcgm-exporter
pullPolicy: IfNotPresent
# Image tag defaults to AppVersion, but you can use the tag key
# for the image tag, e.g:
tag: 4.1.1-4.0.4-ubuntu22.04
# Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
# to stop profiling metrics from DCGM
arguments: ["-f", "/etc/dcgm-exporter/default-counters.csv"]
# NOTE: in general, add any command line arguments to arguments above
# and they will be passed through.
# Use "-r", "<HOST>:<PORT>" to connect to an already running hostengine
# Example arguments: ["-r", "host123:5555"]
# Use "-n" to remove the hostname tag from the output.
# Example arguments: ["-n"]
# Use "-d" to specify the devices to monitor. -d must be followed by a string
# in the following format: [f] or [g[:numeric_range][+]][i[:numeric_range]]
# Where a numeric range is something like 0-4 or 0,2,4, etc.
# Example arguments: ["-d", "g+i"] to monitor all GPUs and GPU instances or
# ["-d", "g:0-3"] to monitor GPUs 0-3.
# Use "-m" to specify the namespace and name of a configmap containing
# the watched exporter fields.
# Example arguments: ["-m", "default:exporter-metrics-config-map"]
# Image pull secrets for container images
imagePullSecrets: []
# Overrides the chart's name
nameOverride: ""
# Overrides the chart's computed fullname
fullnameOverride: ""
# Overrides the deployment namespace
namespaceOverride: ""
# Defines the runtime class that will be used by the pod
runtimeClassName: ""
# Defines serviceAccount names for components.
serviceAccount:
# Specifies whether a service account should be created
create: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name:
rollingUpdate:
# Specifies maximum number of DaemonSet pods that can be unavailable during the update
maxUnavailable: 1
# Specifies maximum number of nodes with an existing available DaemonSet pod that can have an updated DaemonSet pod during during an update
maxSurge: 0
# Labels to be added to dcgm-exporter pods
podLabels: {}
# Annotations to be added to dcgm-exporter pods
podAnnotations: {}
# Using this annotation which is required for prometheus scraping
# prometheus.io/scrape: "true"
# prometheus.io/port: "9400"
# The SecurityContext for the dcgm-exporter pods
podSecurityContext: {}
# fsGroup: 2000
# The SecurityContext for the dcgm-exporter containers
securityContext:
runAsNonRoot: false
runAsUser: 0
capabilities:
add: ["SYS_ADMIN"]
# readOnlyRootFilesystem: true
# Defines the dcgm-exporter service
service:
# When enabled, the helm chart will create service
enable: true
type: ClusterIP
clusterIP: ""
port: 9400
address: ":9400"
# Annotations to add to the service
annotations: {}
# Allows to control pod resources
resources: {}
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
serviceMonitor:
apiVersion: "monitoring.coreos.com/v1"
enabled: true
interval: 15s
honorLabels: false
additionalLabels: {}
#monitoring: prometheus
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# targetLabel: nodename
# replacement: $1
# action: replace
nodeSelector: {}
#node: gpu
tolerations: []
#- operator: Exists
affinity: {}
#nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: nvidia-gpu
# operator: Exists
extraHostVolumes: []
#- name: host-binaries
# hostPath: /opt/bin
extraConfigMapVolumes:
- name: exporter-metrics-volume
configMap:
name: exporter-metrics-config-map
items:
- key: metrics
path: default-counters.csv
extraVolumeMounts:
- name: exporter-metrics-volume
mountPath: /etc/dcgm-exporter/default-counters.csv
subPath: default-counters.csv
extraEnv: []
#- name: EXTRA_VAR
# value: "TheStringValue"
# Path to the kubelet socket for /pod-resources
kubeletPath: "/var/lib/kubelet/pod-resources"
# HTTPS configuration
tlsServerConfig:
# Enable or disable HTTPS configuration
enabled: false
# Use autogenerated self-signed TLS certificates. Not recommended for production environments.
autoGenerated: true
# Existing secret containing your own server key and certificate
existingSecret: ""
# Certificate file name
certFilename: "tls.crt"
# Key file name
keyFilename: "tls.key"
# CA certificate file name
caFilename: "ca.crt"
# Server policy for client authentication. Maps to ClientAuth Policies.
# For more detail on clientAuth options:
# https://golang.org/pkg/crypto/tls/#ClientAuthType
#
# NOTE: If you want to enable client authentication, you need to use
# RequireAndVerifyClientCert. Other values are insecure.
clientAuthType: ""
# TLS Key for HTTPS - ignored if existingSecret is provided
key: ""
# TLS Certificate for HTTPS - ignored if existingSecret is provided
cert: ""
# CA Certificate for HTTPS - ignored if existingSecret is provided
ca: ""
basicAuth:
#Object containing <user>:<passwords> key-value pairs for each user that will have access via basic authentication
users: {}
# Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list.
# Must be the complete list and is not additive. If unset, the default list will take effect.
# customMetrics: |
# Format
# If line starts with a '#' it is considered a comment
# DCGM FIELD, Prometheus metric type, help message
# Clocks
# DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
# DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
# Temperature
# DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
# DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
# Power
# DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
# DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
# PCIE
# DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
# DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML.
# DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
# Utilization (the sample period varies depending on the product)
# DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
# DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
# DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
# DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).
# Errors and violations
# DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
# Memory usage
# DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
# DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
# ECC
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
# Retired pages
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
# NVLink
# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
# DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes.
# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload.
# VGPU License status
# DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
# Remapped rows
# DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
# DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
# DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
# DCP metrics
# DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active.
# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned.
# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM.
# DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active.
# DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data.
# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active.
# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active.
# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active.
# DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload.
# DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload.