Skip to content

Commit ced27ca

Browse files
authored
feat: Add dcgm diagnostics as a preflight check (#772)
Signed-off-by: Ajay Mishra <ajmishra@nvidia.com>
1 parent 3dba730 commit ced27ca

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+2687
-157
lines changed

.github/workflows/cleanup-untagged-images.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ jobs:
8989
- nvsentinel-janitor
9090
- nvsentinel-fake-dcgm
9191
- nvsentinel/preflight
92-
- nvsentinel/ping
92+
- nvsentinel/preflight-dcgm-diag
9393

9494
steps:
9595
- name: Delete untagged images for ${{ matrix.package }}

.github/workflows/container-build-test.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ jobs:
6565
make_command: 'make -C log-collector docker-build-log-collector'
6666
- component: file-server-cleanup
6767
make_command: 'make -C log-collector docker-build-file-server-cleanup'
68+
# Preflight Checks (Docker-based)
69+
- component: preflight-dcgm-diag
70+
make_command: 'make -C preflight-checks/dcgm-diag docker-build'
6871
# GPU Reset (Docker-based)
6972
- component: gpu-reset
7073
make_command: 'make -C gpu-reset docker-build'
@@ -131,8 +134,6 @@ jobs:
131134
path: .
132135
- module: preflight
133136
path: .
134-
- module: preflight-checks/ping
135-
path: .
136137
steps:
137138
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
138139

.github/workflows/lint-test.yml

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ jobs:
143143
strategy:
144144
matrix:
145145
component:
146-
- ping
146+
- dcgm-diag
147147
steps:
148148
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
149149

@@ -153,15 +153,6 @@ jobs:
153153
- name: Run lint and test
154154
run: make -C preflight-checks/${{ matrix.component }} lint-test
155155

156-
- name: Upload artifacts
157-
uses: ./.github/actions/upload-test-artifacts
158-
with:
159-
component-name: preflight-${{ matrix.component }}
160-
file-paths: |
161-
preflight-checks/${{ matrix.component }}/coverage.xml
162-
preflight-checks/${{ matrix.component }}/coverage.txt
163-
preflight-checks/${{ matrix.component }}/report.xml
164-
165156
modules-lint-test:
166157
if: github.repository == 'nvidia/nvsentinel'
167158
runs-on: linux-amd64-cpu16

.github/workflows/publish.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,9 @@ jobs:
117117
- component: file-server-cleanup
118118
make_command: 'make -C log-collector docker-publish-file-server-cleanup'
119119
container_name: 'nvsentinel/file-server-cleanup'
120+
- component: preflight-dcgm-diag
121+
make_command: 'make -C preflight-checks/dcgm-diag docker-publish'
122+
container_name: 'nvsentinel/preflight-dcgm-diag'
120123
- component: gpu-reset
121124
make_command: 'make -C gpu-reset docker-publish'
122125
container_name: 'nvsentinel/gpu-reset'

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,3 +444,5 @@ tests/scale-tests/FQM_LATENCY_TEST_PLAN.md
444444
tests/scale-tests/CONCURRENT_DRAIN_TEST_PLAN.md
445445
tests/scale-tests/results/*.csv
446446
tests/scale-tests/cmd/fqm-scale-test/results/
447+
preflight-checks/dcgm-diag/dcgm-diag
448+
preflight/preflight

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,8 @@ protos-generate: protos-clean ## Generate protobuf files from .proto sources
302302
$(MAKE) -C api protos-generate
303303
# Generate Python protobuf files for gpu-health-monitor
304304
$(MAKE) -C health-monitors/gpu-health-monitor protos-generate
305+
# Generate Python protobuf files for dcgm-diag preflight check
306+
$(MAKE) -C preflight-checks/dcgm-diag protos-generate
305307

306308
# Check protobuf files
307309
.PHONY: protos-lint

distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/_helpers.tpl

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,44 @@ Selector labels
3737
{{- define "gpu-health-monitor.selectorLabels" -}}
3838
app.kubernetes.io/name: {{ include "gpu-health-monitor.name" . }}
3939
app.kubernetes.io/instance: {{ .Release.Name }}
40+
{{- end }}
41+
42+
{{/*
43+
DCGM service enabled - uses global.dcgm.enabled with fallback to local
44+
*/}}
45+
{{- define "gpu-health-monitor.dcgmEnabled" -}}
46+
{{- if and .Values.global .Values.global.dcgm }}
47+
{{- .Values.global.dcgm.enabled }}
48+
{{- else }}
49+
{{- .Values.dcgm.dcgmK8sServiceEnabled }}
50+
{{- end }}
51+
{{- end }}
52+
53+
{{/*
54+
DCGM service endpoint - uses global.dcgm.service.endpoint with fallback to local
55+
*/}}
56+
{{- define "gpu-health-monitor.dcgmEndpoint" -}}
57+
{{- if and .Values.global .Values.global.dcgm .Values.global.dcgm.service }}
58+
{{- .Values.global.dcgm.service.endpoint | default .Values.dcgm.service.endpoint }}
59+
{{- else }}
60+
{{- .Values.dcgm.service.endpoint }}
61+
{{- end }}
62+
{{- end }}
63+
64+
{{/*
65+
DCGM service port - uses global.dcgm.service.port with fallback to local
66+
*/}}
67+
{{- define "gpu-health-monitor.dcgmPort" -}}
68+
{{- if and .Values.global .Values.global.dcgm .Values.global.dcgm.service }}
69+
{{- .Values.global.dcgm.service.port | default .Values.dcgm.service.port }}
70+
{{- else }}
71+
{{- .Values.dcgm.service.port }}
72+
{{- end }}
73+
{{- end }}
74+
75+
{{/*
76+
DCGM address - combines endpoint and port
77+
*/}}
78+
{{- define "gpu-health-monitor.dcgmAddr" -}}
79+
{{- printf "%s:%v" (include "gpu-health-monitor.dcgmEndpoint" .) (include "gpu-health-monitor.dcgmPort" .) }}
4080
{{- end }}

distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-3.x.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,15 +47,15 @@ spec:
4747
- --dcgm-error-mapping-config-file
4848
- "/etc/dcgmhealth/dcgmerrorsmapping.csv"
4949
- --dcgm-addr
50-
- {{ ternary (printf "%s:%v" .Values.dcgm.service.endpoint .Values.dcgm.service.port) "localhost:5555" .Values.dcgm.dcgmK8sServiceEnabled | quote }}
50+
- {{ ternary (include "gpu-health-monitor.dcgmAddr" .) "localhost:5555" (include "gpu-health-monitor.dcgmEnabled" . | eq "true") | quote }}
5151
- --port
5252
- "{{ .Values.global.metricsPort }}"
5353
- --verbose
5454
- {{ .Values.verbose | quote }}
5555
- --state-file
5656
- "/var/run/statefile"
5757
- --dcgm-k8s-service-enabled
58-
- {{ .Values.dcgm.dcgmK8sServiceEnabled | quote }}
58+
- {{ include "gpu-health-monitor.dcgmEnabled" . | quote }}
5959
- --metadata-path
6060
- {{ .Values.global.metadataPath | quote }}
6161
- --processing-strategy

distros/kubernetes/nvsentinel/charts/gpu-health-monitor/templates/daemonset-dcgm-4.x.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,15 +47,15 @@ spec:
4747
- --dcgm-error-mapping-config-file
4848
- "/etc/dcgmhealth/dcgmerrorsmapping.csv"
4949
- --dcgm-addr
50-
- {{ ternary (printf "%s:%v" .Values.dcgm.service.endpoint .Values.dcgm.service.port) "localhost:5555" .Values.dcgm.dcgmK8sServiceEnabled | quote }}
50+
- {{ ternary (include "gpu-health-monitor.dcgmAddr" .) "localhost:5555" (include "gpu-health-monitor.dcgmEnabled" . | eq "true") | quote }}
5151
- --port
5252
- "{{ .Values.global.metricsPort }}"
5353
- --verbose
5454
- {{ .Values.verbose | quote }}
5555
- --state-file
5656
- "/var/run/statefile"
5757
- --dcgm-k8s-service-enabled
58-
- {{ .Values.dcgm.dcgmK8sServiceEnabled | quote }}
58+
- {{ include "gpu-health-monitor.dcgmEnabled" . | quote }}
5959
- --metadata-path
6060
- {{ .Values.global.metadataPath | quote }}
6161
- --processing-strategy

distros/kubernetes/nvsentinel/charts/preflight/templates/_helpers.tpl

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,57 @@ Certificate DNS names
9999
- {{ include "preflight.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local
100100
{{- end }}
101101

102+
{{/*
103+
DCGM service endpoint - uses global.dcgm.service.endpoint with fallback to local
104+
*/}}
105+
{{- define "preflight.dcgmEndpoint" -}}
106+
{{- if and .Values.global .Values.global.dcgm .Values.global.dcgm.service }}
107+
{{- .Values.global.dcgm.service.endpoint | default .Values.dcgm.service.endpoint }}
108+
{{- else }}
109+
{{- .Values.dcgm.service.endpoint }}
110+
{{- end }}
111+
{{- end }}
112+
113+
{{/*
114+
DCGM service port - uses global.dcgm.service.port with fallback to local
115+
*/}}
116+
{{- define "preflight.dcgmPort" -}}
117+
{{- if and .Values.global .Values.global.dcgm .Values.global.dcgm.service }}
118+
{{- .Values.global.dcgm.service.port | default .Values.dcgm.service.port }}
119+
{{- else }}
120+
{{- .Values.dcgm.service.port }}
121+
{{- end }}
122+
{{- end }}
123+
124+
{{/*
125+
DCGM hostengine address - combines endpoint and port
126+
*/}}
127+
{{- define "preflight.dcgmHostengineAddr" -}}
128+
{{- printf "%s:%v" (include "preflight.dcgmEndpoint" .) (include "preflight.dcgmPort" .) }}
129+
{{- end }}
130+
131+
{{/*
132+
DCGM diagnostic level
133+
*/}}
134+
{{- define "preflight.dcgmDiagLevel" -}}
135+
{{- .Values.dcgm.diagLevel | default 1 }}
136+
{{- end }}
137+
138+
{{/*
139+
Event processing strategy
140+
*/}}
141+
{{- define "preflight.processingStrategy" -}}
142+
{{- .Values.dcgm.processingStrategy | default "EXECUTE_REMEDIATION" }}
143+
{{- end }}
144+
145+
{{/*
146+
Platform connector socket path for health event reporting
147+
Uses global.socketPath with unix:// prefix
148+
*/}}
149+
{{- define "preflight.connectorSocket" -}}
150+
{{- if and .Values.global .Values.global.socketPath }}
151+
{{- printf "unix://%s" .Values.global.socketPath }}
152+
{{- else }}
153+
{{- "unix:///var/run/nvsentinel.sock" }}
154+
{{- end }}
155+
{{- end }}

0 commit comments

Comments
 (0)