Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions azure/scope/machinepool.go
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also add a test case to TestMachinePoolScope_NeedsRequeue to teset the new path as well? Something like "should not requeue if an instance VM image does not match the VMSS when SkipMachinePoolModelReconciliation is enabled". Sorry I missed this in the initial review.

Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ import (
"sigs.k8s.io/cluster-api-provider-azure/azure/services/scalesets"
"sigs.k8s.io/cluster-api-provider-azure/azure/services/virtualmachineimages"
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/feature"
azureutil "sigs.k8s.io/cluster-api-provider-azure/util/azure"
"sigs.k8s.io/cluster-api-provider-azure/util/futures"
"sigs.k8s.io/cluster-api-provider-azure/util/tele"
Expand Down Expand Up @@ -303,16 +304,20 @@ func (m *MachinePoolScope) SetVMSSState(vmssState *azure.VMSS) {
m.vmssState = vmssState
}

// NeedsRequeue return true if any machines are not on the latest model or the VMSS is not in a terminal provisioning
// state.
// NeedsRequeue returns true if the VMSS is not in a terminal provisioning state, desired replicas do not match actual,
// or (when SkipMachinePoolModelReconciliation is disabled) any machines are not on the latest model.
func (m *MachinePoolScope) NeedsRequeue() bool {
state := m.AzureMachinePool.Status.ProvisioningState
if m.vmssState == nil {
return state != nil && infrav1.IsTerminalProvisioningState(*state)
}

if !m.vmssState.HasLatestModelAppliedToAll() {
return true
// Skip requeue for model state when SkipMachinePoolModelReconciliation is enabled.
// This allows instances with stale models to persist until explicitly scaled.
if !feature.Gates.Enabled(feature.SkipMachinePoolModelReconciliation) {
if !m.vmssState.HasLatestModelAppliedToAll() {
return true
}
}

desiredMatchesActual := len(m.vmssState.Instances) == int(m.DesiredReplicas())
Expand Down
32 changes: 29 additions & 3 deletions azure/scope/machinepool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/intstr"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/utils/ptr"
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
v1beta1conditions "sigs.k8s.io/cluster-api/util/deprecated/v1beta1/conditions"
Expand All @@ -44,6 +45,7 @@ import (
"sigs.k8s.io/cluster-api-provider-azure/azure/services/roleassignments"
"sigs.k8s.io/cluster-api-provider-azure/azure/services/scalesets"
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/feature"
)

func TestMachinePoolScope_Name(t *testing.T) {
Expand Down Expand Up @@ -508,9 +510,10 @@ func TestMachinePoolScope_GetVMImage(t *testing.T) {

func TestMachinePoolScope_NeedsRequeue(t *testing.T) {
cases := []struct {
Name string
Setup func(mp *clusterv1.MachinePool, amp *infrav1exp.AzureMachinePool, vmss *azure.VMSS)
Verify func(g *WithT, requeue bool)
Name string
Setup func(mp *clusterv1.MachinePool, amp *infrav1exp.AzureMachinePool, vmss *azure.VMSS)
SkipModel bool
Verify func(g *WithT, requeue bool)
}{
{
Name: "should requeue if the machine is not in succeeded state",
Expand Down Expand Up @@ -582,10 +585,33 @@ func TestMachinePoolScope_NeedsRequeue(t *testing.T) {
g.Expect(requeue).To(BeTrue())
},
},
{
Name: "should not requeue if an instance VM image does not match the VMSS when SkipMachinePoolModelReconciliation is enabled",
Setup: func(mp *clusterv1.MachinePool, amp *infrav1exp.AzureMachinePool, vmss *azure.VMSS) {
succeeded := infrav1.Succeeded
mp.Spec.Replicas = ptr.To[int32](1)
amp.Status.ProvisioningState = &succeeded
vmss.Instances = []azure.VMSSVM{
{
Name: "instance1",
Image: infrav1.Image{
Marketplace: &infrav1.AzureMarketplaceImage{
Version: "foo1",
},
},
},
}
},
SkipModel: true,
Verify: func(g *WithT, requeue bool) {
g.Expect(requeue).To(BeFalse())
},
},
}

for _, c := range cases {
t.Run(c.Name, func(t *testing.T) {
featuregatetesting.SetFeatureGateDuringTest(t, feature.Gates, feature.SkipMachinePoolModelReconciliation, c.SkipModel)
var (
g = NewWithT(t)
mockCtrl = gomock.NewController(t)
Expand Down
Comment thread
jackfrancis marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (

infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/feature"
"sigs.k8s.io/cluster-api-provider-azure/util/tele"
)

Expand Down Expand Up @@ -118,6 +119,8 @@ func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx co
return nil, err
}

skipModelReconciliation := feature.Gates.Enabled(feature.SkipMachinePoolModelReconciliation)

var (
order = func() func(machines []infrav1exp.AzureMachinePoolMachine) []infrav1exp.AzureMachinePoolMachine {
switch rollingUpdateStrategy.DeletePolicy {
Expand Down Expand Up @@ -182,14 +185,19 @@ func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx co
// we have too many machines, let's choose the oldest to remove
if overProvisionCount > 0 {
var toDelete []infrav1exp.AzureMachinePoolMachine
log.Info("over-provisioned", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "machinesWithoutLatestModel", getProviderIDs(machinesWithoutLatestModel))
// we are over-provisioned try to remove old models
for _, v := range machinesWithoutLatestModel {
if len(toDelete) >= overProvisionCount {
return toDelete, nil
log.Info("over-provisioned", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "machinesWithoutLatestModel", getProviderIDs(machinesWithoutLatestModel), "skipModelReconciliation", skipModelReconciliation)

// When SkipMachinePoolModelReconciliation is enabled, skip prioritizing machines without latest model.
// Just delete the oldest ready machines to meet the desired replica count.
if !skipModelReconciliation {
// we are over-provisioned try to remove old models
for _, v := range machinesWithoutLatestModel {
if len(toDelete) >= overProvisionCount {
return toDelete, nil
}

toDelete = append(toDelete, v)
}

toDelete = append(toDelete, v)
}

log.Info("over-provisioned ready", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "readyMachines", getProviderIDs(readyMachines))
Expand All @@ -205,6 +213,11 @@ func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx co
return toDelete, nil
}

if skipModelReconciliation {
log.Info("skip model reconciliation enabled, not selecting machines based on latest model state")
return []infrav1exp.AzureMachinePoolMachine{}, nil
}

if len(machinesWithoutLatestModel) == 0 {
log.Info("nothing more to do since all the AzureMachinePoolMachine(s) are the latest model and not over-provisioned")
return []infrav1exp.AzureMachinePoolMachine{}, nil
Expand All @@ -216,7 +229,7 @@ func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx co
}

var toDelete []infrav1exp.AzureMachinePoolMachine
log.Info("removing ready machines within disruption budget", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines))
log.Info("removing ready machines within disruption budget", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines), "skipModelReconciliation", skipModelReconciliation)
for _, v := range readyMachines {
if len(toDelete) >= disruptionBudget {
return toDelete, nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ import (
"github.com/onsi/gomega/types"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
featuregatetesting "k8s.io/component-base/featuregate/testing"
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"

infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/feature"
"sigs.k8s.io/cluster-api-provider-azure/internal/test/matchers/gomega"
)

Expand Down Expand Up @@ -187,6 +189,7 @@ func TestMachinePoolRollingUpdateStrategy_SelectMachinesToDelete(t *testing.T) {
strategy DeleteSelector
input map[string]infrav1exp.AzureMachinePoolMachine
desiredReplicas int32
skipModel bool
want types.GomegaMatcher
errStr string
}{
Expand Down Expand Up @@ -384,6 +387,32 @@ func TestMachinePoolRollingUpdateStrategy_SelectMachinesToDelete(t *testing.T) {
},
want: HaveLen(1),
},
{
name: "if skip model reconciliation is enabled, do not select stale model machines at steady desired count",
strategy: makeRollingUpdateStrategy(infrav1exp.MachineRollingUpdateDeployment{MaxUnavailable: &one}),
desiredReplicas: 3,
skipModel: true,
input: map[string]infrav1exp.AzureMachinePoolMachine{
"foo": makeAMPM(ampmOptions{Ready: true, LatestModel: true, ProvisioningState: succeeded}),
"bin": makeAMPM(ampmOptions{Ready: true, LatestModel: true, ProvisioningState: succeeded}),
"baz": makeAMPM(ampmOptions{Ready: true, LatestModel: false, ProvisioningState: succeeded}),
},
want: BeEmpty(),
},
{
name: "if over-provisioned and skip model reconciliation is enabled, select by delete policy and not stale model priority",
strategy: makeRollingUpdateStrategy(infrav1exp.MachineRollingUpdateDeployment{DeletePolicy: infrav1exp.OldestDeletePolicyType}),
desiredReplicas: 2,
skipModel: true,
input: map[string]infrav1exp.AzureMachinePoolMachine{
"foo": makeAMPM(ampmOptions{Ready: true, LatestModel: false, ProvisioningState: succeeded, CreationTime: metav1.NewTime(baseTime.Add(4 * time.Hour))}),
"bin": makeAMPM(ampmOptions{Ready: true, LatestModel: true, ProvisioningState: succeeded, CreationTime: metav1.NewTime(baseTime.Add(3 * time.Hour))}),
"baz": makeAMPM(ampmOptions{Ready: true, LatestModel: true, ProvisioningState: succeeded, CreationTime: metav1.NewTime(baseTime.Add(2 * time.Hour))}),
},
want: gomega.DiffEq([]infrav1exp.AzureMachinePoolMachine{
makeAMPM(ampmOptions{Ready: true, LatestModel: true, ProvisioningState: succeeded, CreationTime: metav1.NewTime(baseTime.Add(2 * time.Hour))}),
}),
},
{
name: "if maxUnavailable is 30%, and there are 2 with the latest model == false, delete 0.",
strategy: makeRollingUpdateStrategy(infrav1exp.MachineRollingUpdateDeployment{MaxUnavailable: &thirtyPercent}),
Expand All @@ -400,6 +429,7 @@ func TestMachinePoolRollingUpdateStrategy_SelectMachinesToDelete(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)
featuregatetesting.SetFeatureGateDuringTest(t, feature.Gates, feature.SkipMachinePoolModelReconciliation, tt.skipModel)
got, err := tt.strategy.SelectMachinesToDelete(t.Context(), tt.desiredReplicas, tt.input)
if tt.errStr == "" {
g.Expect(err).To(Succeed())
Expand Down
26 changes: 26 additions & 0 deletions docs/book/src/self-managed/machinepools.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,32 @@ spec:
type: RollingUpdate
```

### Skipping Model Reconciliation
- **Feature status:** Experimental (Alpha)
- **Feature gate:** SkipMachinePoolModelReconciliation=false
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be SkipMachinePoolModelReconciliation=true or is it indicating the default is false?

- **Default value:** false (disabled)

By default, when the VMSS model changes (for example, when the OS image, VM SKU, or any other field that maps to the
underlying VMSS model is updated), CAPZ will progressively replace existing VMSS instances so that every instance is
running the latest model. This is implemented in the `AzureMachinePool` reconciler by requeueing as long as any instance
is detected to be running a stale model, and by prioritizing stale-model instances when selecting machines to delete.

The `SkipMachinePoolModelReconciliation` feature gate disables that automatic convergence. When the gate is enabled:

- `AzureMachinePool` will not requeue solely because one or more VMSS instances are running a stale model.
- The rolling update strategy will not preferentially delete stale-model instances; deletion is driven only by the
configured `deletePolicy` (e.g., `Oldest`, `Newest`, `Random`) and `maxUnavailable` / `maxSurge` budgets.
- Existing instances on a previous model will persist until the pool is explicitly scaled, an instance is manually
deleted, or the user otherwise triggers replacement.

This gate does **not** prevent CAPZ from updating the underlying VMSS template when the `AzureMachinePool` spec
changes, and it does **not** block surge behavior in the VMSS reconciler itself. It only controls whether CAPZ will
proactively replace instances running an older model with new instances (running the latest model).

This is useful for testing scenarios and for operators who want to manage instance refresh on their own schedule
without disabling other reconciliation behavior. To enable it, set `EXP_SKIP_MACHINE_POOL_MODEL_RECONCILIATION=true`
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is this environment variable wired? It doesn't look like it's being read in manager.yaml like the other feature gates.

in the CAPZ controller-manager environment (or pass `--feature-gates=SkipMachinePoolModelReconciliation=true`).

### AzureMachinePoolMachines
`AzureMachinePoolMachine` represents a virtual machine in the scale set. `AzureMachinePoolMachines` are created by the
`AzureMachinePool` controller and are used to track the life cycle of a virtual machine in the scale set. When a
Expand Down
22 changes: 17 additions & 5 deletions feature/feature.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,17 @@ const (
// owner: @nawazkh
// alpha: v1.18
APIServerILB featuregate.Feature = "APIServerILB"

// SkipMachinePoolModelReconciliation is a CAPZ feature gate to skip automatic reconciliation
// of AzureMachinePool instances to match the latest VMSS model. When enabled, instances
// with stale models will persist until the pool is explicitly scaled. This is useful for
// testing scenarios where model changes should not trigger automatic instance replacement.
Comment thread
jackfrancis marked this conversation as resolved.
// This gate does not prevent VMSS template updates and does not block surge behavior for
// model changes in the VMSS reconciler.
// Defaults to false.
// owner: @jackfrancis
// alpha: v1.24
SkipMachinePoolModelReconciliation featuregate.Feature = "SkipMachinePoolModelReconciliation"
)

func init() {
Expand All @@ -68,9 +79,10 @@ func init() {
// To add a new feature, define a key for it above and add it here.
var defaultCAPZFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
// Every feature should be initiated here:
AKS: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // Remove in 1.12
AKSResourceHealth: {Default: false, PreRelease: featuregate.Alpha},
EdgeZone: {Default: false, PreRelease: featuregate.Alpha},
ASOAPI: {Default: true, PreRelease: featuregate.GA},
APIServerILB: {Default: false, PreRelease: featuregate.Alpha},
AKS: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // Remove in 1.12
AKSResourceHealth: {Default: false, PreRelease: featuregate.Alpha},
EdgeZone: {Default: false, PreRelease: featuregate.Alpha},
ASOAPI: {Default: true, PreRelease: featuregate.GA},
APIServerILB: {Default: false, PreRelease: featuregate.Alpha},
SkipMachinePoolModelReconciliation: {Default: false, PreRelease: featuregate.Alpha},
}
Loading