Skip to content

Commit f6ebd10

Browse files
committed
Adding SlowRetry on Infeasible Provisioning
1 parent 99fd382 commit f6ebd10

File tree

5 files changed

+281
-181
lines changed

5 files changed

+281
-181
lines changed

controller/controller.go

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,12 @@ import (
3030
"sync"
3131
"time"
3232

33+
"github.com/kubernetes-csi/csi-lib-utils/slowset"
3334
"github.com/prometheus/client_golang/prometheus"
3435
"github.com/prometheus/client_golang/prometheus/promhttp"
3536
"golang.org/x/time/rate"
37+
"google.golang.org/grpc/codes"
38+
"google.golang.org/grpc/status"
3639
v1 "k8s.io/api/core/v1"
3740
storage "k8s.io/api/storage/v1"
3841
storagebeta "k8s.io/api/storage/v1beta1"
@@ -183,6 +186,10 @@ type ProvisionController struct {
183186
volumeStore VolumeStore
184187

185188
volumeNameHook VolumeNameHook
189+
190+
slowSet *slowset.SlowSet
191+
192+
retryIntervalMax time.Duration
186193
}
187194

188195
const (
@@ -216,6 +223,8 @@ const (
216223
DefaultMetricsPath = "/metrics"
217224
// DefaultAddFinalizer is used when option function AddFinalizer is omitted
218225
DefaultAddFinalizer = false
226+
// DefaultRetryIntervalMax is used when option function RetryIntervalMax is omitted
227+
DefaultRetryIntervalMax = 5 * time.Minute
219228
)
220229

221230
var errRuntime = fmt.Errorf("cannot call option functions after controller has Run")
@@ -451,6 +460,18 @@ func RetryPeriod(retryPeriod time.Duration) func(*ProvisionController) error {
451460
}
452461
}
453462

463+
// RetryIntervalMax is the maximum retry interval of failed provisioning or deletion.
464+
// Defaults to 5 minutes.
465+
func RetryIntervalMax(retryIntervalMax time.Duration) func(*ProvisionController) error {
466+
return func(c *ProvisionController) error {
467+
if c.HasRun() {
468+
return errRuntime
469+
}
470+
c.retryIntervalMax = retryIntervalMax
471+
return nil
472+
}
473+
}
474+
454475
// ClaimsInformer sets the informer to use for accessing PersistentVolumeClaims.
455476
// Defaults to using a internal informer.
456477
func ClaimsInformer(informer cache.SharedIndexInformer) func(*ProvisionController) error {
@@ -667,8 +688,11 @@ func NewProvisionController(
667688
hasRun: false,
668689
hasRunLock: &sync.Mutex{},
669690
volumeNameHook: getProvisionedVolumeNameForClaim,
691+
retryIntervalMax: DefaultRetryIntervalMax,
670692
}
671693

694+
controller.slowSet = slowset.NewSlowSet(controller.retryIntervalMax)
695+
672696
for _, option := range options {
673697
err := option(controller)
674698
if err != nil {
@@ -840,6 +864,8 @@ func (ctrl *ProvisionController) Run(ctx context.Context) {
840864
defer ctrl.claimQueue.ShutDown()
841865
defer ctrl.volumeQueue.ShutDown()
842866

867+
go ctrl.slowSet.Run(ctx.Done())
868+
843869
ctrl.hasRunLock.Lock()
844870
ctrl.hasRun = true
845871
ctrl.hasRunLock.Unlock()
@@ -1085,6 +1111,10 @@ func (ctrl *ProvisionController) syncClaim(ctx context.Context, obj interface{})
10851111
return fmt.Errorf("expected claim but got %+v", obj)
10861112
}
10871113

1114+
if err := ctrl.delayProvisioningIfRecentlyInfeasible(claim); err != nil {
1115+
return err
1116+
}
1117+
10881118
should, err := ctrl.shouldProvision(ctx, claim)
10891119
if err != nil {
10901120
ctrl.updateProvisionStats(claim, err, time.Time{})
@@ -1494,7 +1524,20 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
14941524
}
14951525

14961526
ctx2 := klog.NewContext(ctx, logger)
1497-
err = fmt.Errorf("failed to provision volume with StorageClass %q: %v", claimClass, err)
1527+
1528+
if isInfeasibleError(err) {
1529+
logger.V(2).Info("Detected infeasible volume provisioning request",
1530+
"error", err,
1531+
"claim", klog.KObj(claim))
1532+
1533+
ctrl.markForSlowRetry(ctx, claim, err)
1534+
1535+
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, "ProvisioningFailed",
1536+
fmt.Sprintf("Volume provisioning failed with infeasible error. Retries will be delayed. %v", err))
1537+
1538+
return ProvisioningFinished, err
1539+
}
1540+
14981541
return ctrl.provisionVolumeErrorHandling(ctx2, result, err, claim)
14991542
}
15001543

@@ -1519,6 +1562,62 @@ func (ctrl *ProvisionController) provisionClaimOperation(ctx context.Context, cl
15191562
return ProvisioningFinished, nil
15201563
}
15211564

1565+
func (ctrl *ProvisionController) delayProvisioningIfRecentlyInfeasible(claim *v1.PersistentVolumeClaim) error {
1566+
key := string(claim.UID)
1567+
1568+
claimClass := util.GetPersistentVolumeClaimClass(claim)
1569+
currentClass, err := ctrl.getStorageClass(claimClass)
1570+
if err != nil {
1571+
return nil
1572+
}
1573+
1574+
if info, exists := ctrl.slowSet.Get(key); exists {
1575+
if info.StorageClassUID != string(currentClass.UID) {
1576+
ctrl.slowSet.Remove(key)
1577+
return nil
1578+
}
1579+
}
1580+
if delay := ctrl.slowSet.TimeRemaining(key); delay > 0 {
1581+
return util.NewDelayRetryError(fmt.Sprintf("skipping volume provisioning for pvc %s, because provisioning previously failed with infeasible error", key))
1582+
}
1583+
return nil
1584+
}
1585+
1586+
func (ctrl *ProvisionController) markForSlowRetry(ctx context.Context, claim *v1.PersistentVolumeClaim, err error) {
1587+
if isInfeasibleError(err) {
1588+
key := string(claim.UID)
1589+
1590+
claimClass := util.GetPersistentVolumeClaimClass(claim)
1591+
class, err := ctrl.getStorageClass(claimClass)
1592+
if err != nil {
1593+
logger := klog.FromContext(ctx)
1594+
logger.Error(err, "Failed to get StorageClass for delay tracking",
1595+
"PVC", klog.KObj(claim))
1596+
return
1597+
}
1598+
1599+
info := slowset.ObjectData{
1600+
Timestamp: time.Now(),
1601+
StorageClassUID: string(class.UID),
1602+
}
1603+
ctrl.slowSet.Add(key, info)
1604+
}
1605+
}
1606+
1607+
func isInfeasibleError(err error) bool {
1608+
1609+
st, ok := status.FromError(err)
1610+
if !ok {
1611+
return false
1612+
}
1613+
1614+
switch st.Code() {
1615+
case codes.InvalidArgument:
1616+
return true
1617+
}
1618+
return false
1619+
}
1620+
15221621
func (ctrl *ProvisionController) provisionVolumeErrorHandling(ctx context.Context, result ProvisioningState, err error, claim *v1.PersistentVolumeClaim) (ProvisioningState, error) {
15231622
logger := klog.FromContext(ctx)
15241623
ctrl.eventRecorder.Event(claim, v1.EventTypeWarning, "ProvisioningFailed", err.Error())

controller/controller_test.go

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import (
2626

2727
"github.com/prometheus/client_golang/prometheus"
2828
dto "github.com/prometheus/client_model/go"
29+
"google.golang.org/grpc/codes"
30+
"google.golang.org/grpc/status"
2931
v1 "k8s.io/api/core/v1"
3032
storage "k8s.io/api/storage/v1"
3133
"k8s.io/apimachinery/pkg/api/resource"
@@ -47,6 +49,7 @@ import (
4749
"k8s.io/klog/v2/ktesting"
4850
_ "k8s.io/klog/v2/ktesting/init"
4951
"sigs.k8s.io/sig-storage-lib-external-provisioner/v11/controller/metrics"
52+
"sigs.k8s.io/sig-storage-lib-external-provisioner/v11/util"
5053
)
5154

5255
const (
@@ -1618,6 +1621,107 @@ func TestControllerSharedInformers(t *testing.T) {
16181621
}
16191622
}
16201623

1624+
// TestInfeasibleRetry tests that sidecar doesn't spam plugin upon infeasible error code (e.g. invalid VAC parameter)
1625+
func TestInfeasibleRetry(t *testing.T) {
1626+
basePVC := newClaim("test-claim", "uid-1-1", "class-1", "foo.bar/baz", "", nil)
1627+
storageClass := newStorageClass("class-1", "foo.bar/baz")
1628+
1629+
tests := []struct {
1630+
name string
1631+
pvc *v1.PersistentVolumeClaim
1632+
expectedProvisionCallCount int
1633+
csiProvisionError error
1634+
eventuallyRemoveFromSlowSet bool
1635+
}{
1636+
{
1637+
name: "Should retry non-infeasible error normally",
1638+
pvc: basePVC,
1639+
expectedProvisionCallCount: 2,
1640+
csiProvisionError: status.Errorf(codes.Internal, "fake non-infeasible error"),
1641+
eventuallyRemoveFromSlowSet: false,
1642+
},
1643+
{
1644+
name: "Should NOT retry infeasible error normally",
1645+
pvc: basePVC,
1646+
expectedProvisionCallCount: 1,
1647+
csiProvisionError: status.Errorf(codes.InvalidArgument, "fake infeasible error"),
1648+
eventuallyRemoveFromSlowSet: false,
1649+
},
1650+
{
1651+
name: "Should EVENTUALLY retry infeasible error",
1652+
pvc: basePVC,
1653+
expectedProvisionCallCount: 2,
1654+
csiProvisionError: status.Errorf(codes.InvalidArgument, "fake infeasible error"),
1655+
eventuallyRemoveFromSlowSet: true,
1656+
},
1657+
}
1658+
1659+
for _, test := range tests {
1660+
t.Run(test.name, func(t *testing.T) {
1661+
// Setup
1662+
_, ctx := ktesting.NewTestContext(t)
1663+
1664+
client := fake.NewSimpleClientset(test.pvc, storageClass)
1665+
1666+
provisioner := newTestProvisioner()
1667+
provisioner.returnError = test.csiProvisionError
1668+
1669+
ctrl := newTestProvisionController(ctx, client, "foo.bar/baz", provisioner)
1670+
1671+
if err := ctrl.classes.Add(storageClass); err != nil {
1672+
t.Fatalf("failed to add StorageClass to cache: %v", err)
1673+
}
1674+
1675+
// First attempt at provision
1676+
err := ctrl.syncClaim(ctx, test.pvc)
1677+
if !errors.Is(err, test.csiProvisionError) {
1678+
t.Errorf("expected error %v but got %v", test.csiProvisionError, err)
1679+
}
1680+
1681+
// For infeasible errors, verify the PVC was added to SlowSet
1682+
if status.Code(test.csiProvisionError) == codes.InvalidArgument {
1683+
key := string(test.pvc.UID)
1684+
if !ctrl.slowSet.Contains(key) {
1685+
t.Error("PVC should have been added to SlowSet after infeasible error")
1686+
}
1687+
}
1688+
1689+
// Fake time passing by removing from SlowSet
1690+
if test.eventuallyRemoveFromSlowSet {
1691+
key := string(test.pvc.UID)
1692+
ctrl.slowSet.Remove(key)
1693+
}
1694+
1695+
// Second attempt at provision
1696+
err2 := ctrl.syncClaim(ctx, test.pvc)
1697+
switch test.expectedProvisionCallCount {
1698+
case 1:
1699+
if !util.IsDelayRetryError(err2) {
1700+
t.Errorf("expected delay retry error but got %v", err2)
1701+
}
1702+
case 2:
1703+
if !errors.Is(err2, test.csiProvisionError) {
1704+
t.Errorf("expected error %v but got %v", test.csiProvisionError, err2)
1705+
}
1706+
default:
1707+
t.Errorf("unexpected provision error in second attempt: %v", err)
1708+
}
1709+
1710+
// Count the number of provision calls from the channel
1711+
provisionCount := 0
1712+
for len(provisioner.provisionCalls) > 0 {
1713+
<-provisioner.provisionCalls
1714+
provisionCount++
1715+
}
1716+
1717+
if test.expectedProvisionCallCount != provisionCount {
1718+
t.Errorf("expected %d provision calls, but got %d",
1719+
test.expectedProvisionCallCount, provisionCount)
1720+
}
1721+
})
1722+
}
1723+
}
1724+
16211725
type testMetrics struct {
16221726
provisioned counts
16231727
deleted counts
@@ -1986,11 +2090,15 @@ type provisionParams struct {
19862090
}
19872091

19882092
func newTestProvisioner() *testProvisioner {
1989-
return &testProvisioner{make(chan provisionParams, 16)}
2093+
return &testProvisioner{
2094+
provisionCalls: make(chan provisionParams, 16),
2095+
returnError: nil,
2096+
}
19902097
}
19912098

19922099
type testProvisioner struct {
19932100
provisionCalls chan provisionParams
2101+
returnError error
19942102
}
19952103

19962104
var _ Provisioner = &testProvisioner{}
@@ -2033,6 +2141,10 @@ func (p *testProvisioner) Provision(ctx context.Context, options ProvisionOption
20332141
allowedTopologies: options.StorageClass.AllowedTopologies,
20342142
}
20352143

2144+
if p.returnError != nil {
2145+
return nil, ProvisioningFinished, p.returnError
2146+
}
2147+
20362148
// Sleep to simulate work done by Provision...for long enough that
20372149
// TestMultipleControllers will consistently fail with lock disabled. If
20382150
// Provision happens too fast, the first controller creates the PV too soon

go.mod

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@ module sigs.k8s.io/sig-storage-lib-external-provisioner/v11
33
go 1.24.0
44

55
require (
6+
github.com/kubernetes-csi/csi-lib-utils v0.22.0
67
github.com/miekg/dns v1.1.66
78
github.com/prometheus/client_golang v1.22.0
89
github.com/prometheus/client_model v0.6.2
910
golang.org/x/time v0.11.0
11+
google.golang.org/grpc v1.69.0
1012
k8s.io/api v0.33.1
1113
k8s.io/apimachinery v0.33.1
1214
k8s.io/client-go v0.33.1
@@ -16,33 +18,27 @@ require (
1618
require (
1719
github.com/beorn7/perks v1.0.1 // indirect
1820
github.com/cespare/xxhash/v2 v2.3.0 // indirect
19-
github.com/davecgh/go-spew v1.1.1 // indirect
21+
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
2022
github.com/emicklei/go-restful/v3 v3.12.2 // indirect
21-
github.com/evanphx/json-patch v5.6.0+incompatible // indirect
2223
github.com/fxamacker/cbor/v2 v2.8.0 // indirect
2324
github.com/go-logr/logr v1.4.3 // indirect
2425
github.com/go-openapi/jsonpointer v0.21.1 // indirect
2526
github.com/go-openapi/jsonreference v0.21.0 // indirect
2627
github.com/go-openapi/swag v0.23.1 // indirect
2728
github.com/gogo/protobuf v1.3.2 // indirect
28-
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
29-
github.com/golang/protobuf v1.5.4 // indirect
3029
github.com/google/gnostic-models v0.6.9 // indirect
3130
github.com/google/go-cmp v0.7.0 // indirect
32-
github.com/google/gofuzz v1.2.0 // indirect
3331
github.com/google/uuid v1.6.0 // indirect
3432
github.com/josharian/intern v1.0.0 // indirect
3533
github.com/json-iterator/go v1.1.12 // indirect
3634
github.com/mailru/easyjson v0.9.0 // indirect
37-
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
3835
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
3936
github.com/modern-go/reflect2 v1.0.2 // indirect
4037
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
4138
github.com/pkg/errors v0.9.1 // indirect
4239
github.com/prometheus/common v0.64.0 // indirect
4340
github.com/prometheus/procfs v0.16.1 // indirect
4441
github.com/x448/float16 v0.8.4 // indirect
45-
golang.org/x/crypto v0.38.0 // indirect
4642
golang.org/x/mod v0.24.0 // indirect
4743
golang.org/x/net v0.40.0 // indirect
4844
golang.org/x/oauth2 v0.30.0 // indirect
@@ -51,11 +47,10 @@ require (
5147
golang.org/x/term v0.32.0 // indirect
5248
golang.org/x/text v0.25.0 // indirect
5349
golang.org/x/tools v0.33.0 // indirect
54-
google.golang.org/appengine v1.6.7 // indirect
50+
google.golang.org/genproto/googleapis/rpc v0.0.0-20241216192217-9240e9c98484 // indirect
5551
google.golang.org/protobuf v1.36.6 // indirect
5652
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
5753
gopkg.in/inf.v0 v0.9.1 // indirect
58-
gopkg.in/yaml.v2 v2.4.0 // indirect
5954
gopkg.in/yaml.v3 v3.0.1 // indirect
6055
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
6156
k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 // indirect

0 commit comments

Comments
 (0)