Skip to content

Commit db47153

Browse files
committed
Move device enumeration helpers to device_enumerator.go
1 parent 3f140cd commit db47153

3 files changed

Lines changed: 142 additions & 114 deletions

File tree

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
/*
2+
Copyright The Kubernetes Authors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"context"
21+
"errors"
22+
"fmt"
23+
"time"
24+
25+
"k8s.io/apimachinery/pkg/util/wait"
26+
"k8s.io/klog/v2"
27+
28+
"github.com/NVIDIA/go-nvml/pkg/nvml"
29+
30+
"sigs.k8s.io/dra-driver-nvidia-gpu/pkg/featuregates"
31+
)
32+
33+
// ErrDeviceEnumerationTimeout is the sentinel error returned by enumerateDevicesWithRetry
34+
// when the retry budget is exhausted before an enumeration attempt yields a usable device set.
// An attempt is unsuccessful not only when no GPU is discovered, but also when it hits a
// transient NVML error or, with PassthroughSupport enabled, still sees orphan vfio devices.
35+
var ErrDeviceEnumerationTimeout = errors.New("device enumeration timed out before any GPU was discovered")
36+
37+
// deviceEnumerator is the narrow view of the device library that the
// enumeration helpers in this file depend on: a single call that lists every
// allocatable device, grouped per GPU, or reports why it could not.
type deviceEnumerator interface {
38+
enumerateAllPossibleDevices() (*PerGPUAllocatableDevices, error)
39+
}
40+
41+
// enumerateDevices performs GPU enumeration attempt.
42+
func enumerateDevices(nvdevlib deviceEnumerator, cp *Checkpoint) (*PerGPUAllocatableDevices, error) {
43+
perGPU, err := nvdevlib.enumerateAllPossibleDevices()
44+
if err != nil {
45+
if isTransientNVMLError(err) {
46+
klog.Infof("Transient NVML error on enumeration attempt; will retry in background: %v", err)
47+
return nil, nil
48+
}
49+
return nil, fmt.Errorf("error enumerating all possible devices: %w", err)
50+
}
51+
if len(perGPU.allocatablesMap) == 0 {
52+
klog.Infof("No GPU devices discovered on enumeration attempt; will retry in background")
53+
return nil, nil
54+
}
55+
if featuregates.Enabled(featuregates.PassthroughSupport) {
56+
if hasOrphanVfioDevices(perGPU, cp) {
57+
klog.Infof("Orphan vfio devices found on enumeration attempt; will retry in background")
58+
return nil, nil
59+
}
60+
}
61+
return perGPU, nil
62+
}
63+
64+
// enumerateDevicesWithRetry retries until at least one device is found, the context is cancelled, or the retry budget is exhausted.
65+
// Transient NVML errors are retried, all other errors propagate immediately.
66+
func enumerateDevicesWithRetry(ctx context.Context, nvdevlib deviceEnumerator, backoff wait.Backoff, cp *Checkpoint) (*PerGPUAllocatableDevices, error) {
67+
totalSteps := backoff.Steps
68+
var perGPUAllocatable *PerGPUAllocatableDevices
69+
err := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) {
70+
var err error
71+
perGPUAllocatable, err = enumerateDevices(nvdevlib, cp)
72+
if err != nil {
73+
return false, err
74+
}
75+
return perGPUAllocatable != nil, nil
76+
})
77+
switch {
78+
case err == nil:
79+
return perGPUAllocatable, nil
80+
case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded):
81+
return nil, fmt.Errorf("context cancelled while waiting for GPU devices: %w", err)
82+
case wait.Interrupted(err):
83+
klog.Errorf("No GPU devices found after %d attempts; failing startup to avoid publishing an empty ResourceSlice", totalSteps)
84+
return nil, ErrDeviceEnumerationTimeout
85+
default:
86+
return nil, err
87+
}
88+
}
89+
90+
// isTransientNVMLError reports whether err is an NVML "not ready yet" error expected during early driver init.
91+
// Errors that may indicate real hardware problems are treated as permanent.
92+
// errors.Is walks the %w-wrap chain and matches by == against the bare nvml.Return at the bottom.
93+
func isTransientNVMLError(err error) bool {
94+
return errors.Is(err, nvml.ERROR_UNINITIALIZED) || errors.Is(err, nvml.ERROR_DRIVER_NOT_LOADED)
95+
}
96+
97+
// deviceEnumerationBackoff builds the retry cadence for background GPU enumeration.
98+
func deviceEnumerationBackoff(flags *Flags) wait.Backoff {
99+
return wait.Backoff{
100+
Duration: 1 * time.Second,
101+
Factor: 2.0,
102+
Jitter: 0.2,
103+
Cap: flags.deviceEnumerationRetryMaxInterval,
104+
Steps: flags.deviceEnumerationRetrySteps,
105+
}
106+
}
107+
108+
// hasOrphanVfioDevices returns true when there are vfio devices with a nil parent GPU (nvml not yet initialized)
109+
// that are not covered by a PrepareCompleted checkpoint entry (which means they were legitimately handed to a VM).
110+
func hasOrphanVfioDevices(perGPU *PerGPUAllocatableDevices, cp *Checkpoint) bool {
111+
// Build set of vfio device names that have a PrepareCompleted checkpoint entry.
112+
prepared := make(map[DeviceName]struct{})
113+
if cp != nil && cp.V2 != nil {
114+
for _, claim := range cp.V2.PreparedClaims {
115+
if claim.CheckpointState != ClaimCheckpointStatePrepareCompleted {
116+
continue
117+
}
118+
for _, group := range claim.PreparedDevices {
119+
for _, dev := range group.Devices {
120+
if dev.Type() == VfioDeviceType {
121+
prepared[dev.Vfio.Device.DeviceName] = struct{}{}
122+
}
123+
}
124+
}
125+
}
126+
}
127+
for _, devices := range perGPU.allocatablesMap {
128+
for _, dev := range devices {
129+
if dev.Type() != VfioDeviceType {
130+
continue
131+
}
132+
if dev.Vfio.parent != nil {
133+
continue
134+
}
135+
// Parentless vfio device — check if it was legitimately prepared.
136+
if _, ok := prepared[dev.CanonicalName()]; !ok {
137+
return true
138+
}
139+
}
140+
}
141+
return false
142+
}
File renamed without changes.

cmd/gpu-kubelet-plugin/device_state.go

Lines changed: 0 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ package main
1818

1919
import (
2020
"context"
21-
"errors"
2221
"fmt"
2322
"io"
2423
"path/filepath"
@@ -36,23 +35,13 @@ import (
3635

3736
"github.com/sirupsen/logrus"
3837

39-
"github.com/NVIDIA/go-nvml/pkg/nvml"
40-
4138
configapi "sigs.k8s.io/dra-driver-nvidia-gpu/api/nvidia.com/resource/v1beta1"
4239
"sigs.k8s.io/dra-driver-nvidia-gpu/pkg/bootid"
4340
"sigs.k8s.io/dra-driver-nvidia-gpu/pkg/featuregates"
4441
"sigs.k8s.io/dra-driver-nvidia-gpu/pkg/flock"
4542
drametrics "sigs.k8s.io/dra-driver-nvidia-gpu/pkg/metrics"
4643
)
4744

48-
// ErrDeviceEnumerationTimeout is returned by enumerateDevicesWithRetry
49-
// when the retry budget is exhausted without discovering any devices.
50-
var ErrDeviceEnumerationTimeout = errors.New("device enumeration timed out before any GPU was discovered")
51-
52-
type deviceEnumerator interface {
53-
enumerateAllPossibleDevices() (*PerGPUAllocatableDevices, error)
54-
}
55-
5645
type OpaqueDeviceConfig struct {
5746
Requests []string
5847
Config runtime.Object
@@ -1333,55 +1322,6 @@ func (s *DeviceState) finalizeAllocatable(perGPU *PerGPUAllocatableDevices) {
13331322
})
13341323
}
13351324

1336-
// enumerateDevices performs GPU enumeration attempt.
1337-
func enumerateDevices(nvdevlib deviceEnumerator, cp *Checkpoint) (*PerGPUAllocatableDevices, error) {
1338-
perGPU, err := nvdevlib.enumerateAllPossibleDevices()
1339-
if err != nil {
1340-
if isTransientNVMLError(err) {
1341-
klog.Infof("Transient NVML error on enumeration attempt; will retry in background: %v", err)
1342-
return nil, nil
1343-
}
1344-
return nil, fmt.Errorf("error enumerating all possible devices: %w", err)
1345-
}
1346-
if len(perGPU.allocatablesMap) == 0 {
1347-
klog.Infof("No GPU devices discovered on enumeration attempt; will retry in background")
1348-
return nil, nil
1349-
}
1350-
if featuregates.Enabled(featuregates.PassthroughSupport) {
1351-
if hasOrphanVfioDevices(perGPU, cp) {
1352-
klog.Infof("Orphan vfio devices found on enumeration attempt; will retry in background")
1353-
return nil, nil
1354-
}
1355-
}
1356-
return perGPU, nil
1357-
}
1358-
1359-
// enumerateDevicesWithRetry retries until at least one device is found, the context is cancelled, or the retry budget is exhausted.
1360-
// Transient NVML errors are retried, all other errors propagate immediately.
1361-
func enumerateDevicesWithRetry(ctx context.Context, nvdevlib deviceEnumerator, backoff wait.Backoff, cp *Checkpoint) (*PerGPUAllocatableDevices, error) {
1362-
totalSteps := backoff.Steps
1363-
var perGPUAllocatable *PerGPUAllocatableDevices
1364-
err := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) {
1365-
var err error
1366-
perGPUAllocatable, err = enumerateDevices(nvdevlib, cp)
1367-
if err != nil {
1368-
return false, err
1369-
}
1370-
return perGPUAllocatable != nil, nil
1371-
})
1372-
switch {
1373-
case err == nil:
1374-
return perGPUAllocatable, nil
1375-
case errors.Is(err, context.Canceled), errors.Is(err, context.DeadlineExceeded):
1376-
return nil, fmt.Errorf("context cancelled while waiting for GPU devices: %w", err)
1377-
case wait.Interrupted(err):
1378-
klog.Errorf("No GPU devices found after %d attempts; failing startup to avoid publishing an empty ResourceSlice", totalSteps)
1379-
return nil, ErrDeviceEnumerationTimeout
1380-
default:
1381-
return nil, err
1382-
}
1383-
}
1384-
13851325
// warmupCDICache populates the CDI device spec cache for every full GPU in perGPU.
13861326
func warmupCDICache(cdi *CDIHandler, perGPU *PerGPUAllocatableDevices) {
13871327
var fullGPUuuids []string
@@ -1395,57 +1335,3 @@ func warmupCDICache(cdi *CDIHandler, perGPU *PerGPUAllocatableDevices) {
13951335
klog.V(2).Infof("Warming up CDI device spec cache for GPUs %v", fullGPUuuids)
13961336
cdi.WarmupDevSpecCache(fullGPUuuids)
13971337
}
1398-
1399-
// isTransientNVMLError reports whether err is an NVML "not ready yet" error expected during early driver init.
1400-
// Errors that may indicate real hardware problems are treated as permanent.
1401-
// errors.Is walks the %w-wrap chain and matches by == against the bare nvml.Return at the bottom.
1402-
func isTransientNVMLError(err error) bool {
1403-
return errors.Is(err, nvml.ERROR_UNINITIALIZED) || errors.Is(err, nvml.ERROR_DRIVER_NOT_LOADED)
1404-
}
1405-
1406-
// deviceEnumerationBackoff builds the retry cadence for background GPU enumeration.
1407-
func deviceEnumerationBackoff(flags *Flags) wait.Backoff {
1408-
return wait.Backoff{
1409-
Duration: 1 * time.Second,
1410-
Factor: 2.0,
1411-
Jitter: 0.2,
1412-
Cap: flags.deviceEnumerationRetryMaxInterval,
1413-
Steps: flags.deviceEnumerationRetrySteps,
1414-
}
1415-
}
1416-
1417-
// hasOrphanVfioDevices returns true when there are vfio devices with a nil parent GPU (nvml not yet initialized)
1418-
// that are not covered by a PrepareCompleted checkpoint entry (which means they were legitimately handed to a VM).
1419-
func hasOrphanVfioDevices(perGPU *PerGPUAllocatableDevices, cp *Checkpoint) bool {
1420-
// Build set of vfio device names that have a PrepareCompleted checkpoint entry.
1421-
prepared := make(map[DeviceName]struct{})
1422-
if cp != nil && cp.V2 != nil {
1423-
for _, claim := range cp.V2.PreparedClaims {
1424-
if claim.CheckpointState != ClaimCheckpointStatePrepareCompleted {
1425-
continue
1426-
}
1427-
for _, group := range claim.PreparedDevices {
1428-
for _, dev := range group.Devices {
1429-
if dev.Type() == VfioDeviceType {
1430-
prepared[dev.Vfio.Device.DeviceName] = struct{}{}
1431-
}
1432-
}
1433-
}
1434-
}
1435-
}
1436-
for _, devices := range perGPU.allocatablesMap {
1437-
for _, dev := range devices {
1438-
if dev.Type() != VfioDeviceType {
1439-
continue
1440-
}
1441-
if dev.Vfio.parent != nil {
1442-
continue
1443-
}
1444-
// Parentless vfio device — check if it was legitimately prepared.
1445-
if _, ok := prepared[dev.CanonicalName()]; !ok {
1446-
return true
1447-
}
1448-
}
1449-
}
1450-
return false
1451-
}

0 commit comments

Comments
 (0)