@@ -18,7 +18,6 @@ package main
1818
1919import (
2020 "context"
21- "errors"
2221 "fmt"
2322 "io"
2423 "path/filepath"
@@ -36,23 +35,13 @@ import (
3635
3736 "github.com/sirupsen/logrus"
3837
39- "github.com/NVIDIA/go-nvml/pkg/nvml"
40-
4138 configapi "sigs.k8s.io/dra-driver-nvidia-gpu/api/nvidia.com/resource/v1beta1"
4239 "sigs.k8s.io/dra-driver-nvidia-gpu/pkg/bootid"
4340 "sigs.k8s.io/dra-driver-nvidia-gpu/pkg/featuregates"
4441 "sigs.k8s.io/dra-driver-nvidia-gpu/pkg/flock"
4542 drametrics "sigs.k8s.io/dra-driver-nvidia-gpu/pkg/metrics"
4643)
4744
48- // ErrDeviceEnumerationTimeout is the sentinel error returned by enumerateDevicesWithRetry
49- // when its retry budget is exhausted without any attempt yielding a usable set of GPU devices.
50- var ErrDeviceEnumerationTimeout = errors .New ("device enumeration timed out before any GPU was discovered" )
51-
52- type deviceEnumerator interface { // enumeration dependency consumed by enumerateDevices and enumerateDevicesWithRetry
53- enumerateAllPossibleDevices () (* PerGPUAllocatableDevices , error ) // one enumeration pass; the error may wrap an NVML return code (see isTransientNVMLError)
54- }
55-
5645type OpaqueDeviceConfig struct {
5746 Requests []string
5847 Config runtime.Object
@@ -1333,55 +1322,6 @@ func (s *DeviceState) finalizeAllocatable(perGPU *PerGPUAllocatableDevices) {
13331322 })
13341323}
13351324
1336- // enumerateDevices performs a single GPU enumeration attempt. It returns (nil, nil) when the attempt produced nothing usable yet and should be retried by the caller: a transient NVML error, zero discovered devices, or (with the PassthroughSupport feature gate enabled) orphan vfio devices. Any other enumeration error is returned wrapped.
1337- func enumerateDevices (nvdevlib deviceEnumerator , cp * Checkpoint ) (* PerGPUAllocatableDevices , error ) {
1338- perGPU , err := nvdevlib .enumerateAllPossibleDevices ()
1339- if err != nil {
1340- if isTransientNVMLError (err ) {
1341- klog .Infof ("Transient NVML error on enumeration attempt; will retry in background: %v" , err )
1342- return nil , nil // transient error is swallowed: nil result with nil error means "not ready, retry"
1343- }
1344- return nil , fmt .Errorf ("error enumerating all possible devices: %w" , err ) // non-transient: surfaced to the caller with the cause wrapped
1345- }
1346- if len (perGPU .allocatablesMap ) == 0 {
1347- klog .Infof ("No GPU devices discovered on enumeration attempt; will retry in background" )
1348- return nil , nil // an empty result is treated as "not ready, retry" rather than as success
1349- }
1350- if featuregates .Enabled (featuregates .PassthroughSupport ) {
1351- if hasOrphanVfioDevices (perGPU , cp ) {
1352- klog .Infof ("Orphan vfio devices found on enumeration attempt; will retry in background" )
1353- return nil , nil // the result is discarded while orphan vfio devices are present
1354- }
1355- }
1356- return perGPU , nil
1357- }
1358-
1359- // enumerateDevicesWithRetry calls enumerateDevices under the given backoff until an attempt yields a non-nil result, the context is cancelled, or the retry budget is exhausted (in which case ErrDeviceEnumerationTimeout is returned).
1360- // Attempts for which enumerateDevices reports (nil, nil) — e.g. transient NVML errors — are retried; all other errors propagate immediately.
1361- func enumerateDevicesWithRetry (ctx context.Context , nvdevlib deviceEnumerator , backoff wait.Backoff , cp * Checkpoint ) (* PerGPUAllocatableDevices , error ) {
1362- totalSteps := backoff .Steps // used only in the timeout log message below
1363- var perGPUAllocatable * PerGPUAllocatableDevices
1364- err := wait .ExponentialBackoffWithContext (ctx , backoff , func (ctx context.Context ) (bool , error ) {
1365- var err error
1366- perGPUAllocatable , err = enumerateDevices (nvdevlib , cp ) // assigns the outer variable so the result survives the closure
1367- if err != nil {
1368- return false , err
1369- }
1370- return perGPUAllocatable != nil , nil // done only once an attempt produced a non-nil result
1371- })
1372- switch {
1373- case err == nil :
1374- return perGPUAllocatable , nil
1375- case errors .Is (err , context .Canceled ), errors .Is (err , context .DeadlineExceeded ): // NOTE(review): must stay ahead of the wait.Interrupted case, which presumably also matches context errors — confirm
1376- return nil , fmt .Errorf ("context cancelled while waiting for GPU devices: %w" , err )
1377- case wait .Interrupted (err ):
1378- klog .Errorf ("No GPU devices found after %d attempts; failing startup to avoid publishing an empty ResourceSlice" , totalSteps )
1379- return nil , ErrDeviceEnumerationTimeout // the sentinel replaces err; the underlying wait error is not wrapped
1380- default :
1381- return nil , err // presumably the error returned by the condition func, passed through unchanged
1382- }
1383- }
1384-
13851325// warmupCDICache populates the CDI device spec cache for every full GPU in perGPU.
13861326func warmupCDICache (cdi * CDIHandler , perGPU * PerGPUAllocatableDevices ) {
13871327 var fullGPUuuids []string
@@ -1395,57 +1335,3 @@ func warmupCDICache(cdi *CDIHandler, perGPU *PerGPUAllocatableDevices) {
13951335 klog .V (2 ).Infof ("Warming up CDI device spec cache for GPUs %v" , fullGPUuuids )
13961336 cdi .WarmupDevSpecCache (fullGPUuuids )
13971337}
1398-
1399- // isTransientNVMLError reports whether err is an NVML "not ready yet" error expected during early driver init.
1400- // Only ERROR_UNINITIALIZED and ERROR_DRIVER_NOT_LOADED count as transient; every other error is treated as permanent, since it may indicate a real hardware problem.
1401- // errors.Is walks the %w-wrap chain and matches by == against the bare nvml.Return at the bottom.
1402- func isTransientNVMLError (err error ) bool {
1403- return errors .Is (err , nvml .ERROR_UNINITIALIZED ) || errors .Is (err , nvml .ERROR_DRIVER_NOT_LOADED )
1404- }
1405-
1406- // deviceEnumerationBackoff builds the retry cadence for background GPU enumeration: a fixed 1s initial duration, factor 2.0 and jitter 0.2, with the interval cap and step count taken from flags.
1407- func deviceEnumerationBackoff (flags * Flags ) wait.Backoff {
1408- return wait.Backoff {
1409- Duration : 1 * time .Second , // initial interval (hard-coded)
1410- Factor : 2.0 , // multiplier applied to the interval between steps
1411- Jitter : 0.2 ,
1412- Cap : flags .deviceEnumerationRetryMaxInterval , // configurable upper bound on the interval
1413- Steps : flags .deviceEnumerationRetrySteps , // configurable retry budget
1414- }
1415- }
1416-
1417- // hasOrphanVfioDevices returns true when perGPU contains at least one vfio device with a nil parent GPU (nvml not yet initialized)
1418- // that is not covered by a PrepareCompleted checkpoint entry (which means it was legitimately handed to a VM). A nil cp or nil cp.V2 is treated as having no prepared claims.
1419- func hasOrphanVfioDevices (perGPU * PerGPUAllocatableDevices , cp * Checkpoint ) bool {
1420- // Build the set of vfio device names that have a PrepareCompleted checkpoint entry.
1421- prepared := make (map [DeviceName ]struct {})
1422- if cp != nil && cp .V2 != nil {
1423- for _ , claim := range cp .V2 .PreparedClaims {
1424- if claim .CheckpointState != ClaimCheckpointStatePrepareCompleted {
1425- continue // claims in any other checkpoint state do not vouch for their devices
1426- }
1427- for _ , group := range claim .PreparedDevices {
1428- for _ , dev := range group .Devices {
1429- if dev .Type () == VfioDeviceType {
1430- prepared [dev .Vfio .Device .DeviceName ] = struct {}{}
1431- }
1432- }
1433- }
1434- }
1435- }
1436- for _ , devices := range perGPU .allocatablesMap {
1437- for _ , dev := range devices {
1438- if dev .Type () != VfioDeviceType {
1439- continue
1440- }
1441- if dev .Vfio .parent != nil {
1442- continue // a vfio device with a known parent GPU is never an orphan
1443- }
1444- // Parentless vfio device — check if it was legitimately prepared. NOTE(review): the set is keyed by Vfio.Device.DeviceName but looked up by CanonicalName(); assumes the two agree — confirm.
1445- if _ , ok := prepared [dev .CanonicalName ()]; ! ok {
1446- return true
1447- }
1448- }
1449- }
1450- return false
1451- }
0 commit comments