Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions azure/scope/machinepool.go
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also add a test case to TestMachinePoolScope_NeedsRequeue to teset the new path as well? Something like "should not requeue if an instance VM image does not match the VMSS when SkipMachinePoolModelReconciliation is enabled". Sorry I missed this in the initial review.

Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ import (
"sigs.k8s.io/cluster-api-provider-azure/azure/services/scalesets"
"sigs.k8s.io/cluster-api-provider-azure/azure/services/virtualmachineimages"
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/feature"
azureutil "sigs.k8s.io/cluster-api-provider-azure/util/azure"
"sigs.k8s.io/cluster-api-provider-azure/util/futures"
"sigs.k8s.io/cluster-api-provider-azure/util/tele"
Expand Down Expand Up @@ -311,8 +312,12 @@ func (m *MachinePoolScope) NeedsRequeue() bool {
return state != nil && infrav1.IsTerminalProvisioningState(*state)
}

if !m.vmssState.HasLatestModelAppliedToAll() {
return true
// Skip requeue for model state when SkipMachinePoolModelReconciliation is enabled.
// This allows instances with stale models to persist until explicitly scaled.
if !feature.Gates.Enabled(feature.SkipMachinePoolModelReconciliation) {
if !m.vmssState.HasLatestModelAppliedToAll() {
return true
}
}

desiredMatchesActual := len(m.vmssState.Instances) == int(m.DesiredReplicas())
Expand Down
Comment thread
jackfrancis marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (

infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
"sigs.k8s.io/cluster-api-provider-azure/feature"
"sigs.k8s.io/cluster-api-provider-azure/util/tele"
)

Expand Down Expand Up @@ -182,14 +183,19 @@ func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx co
// we have too many machines, let's choose the oldest to remove
if overProvisionCount > 0 {
var toDelete []infrav1exp.AzureMachinePoolMachine
log.Info("over-provisioned", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "machinesWithoutLatestModel", getProviderIDs(machinesWithoutLatestModel))
// we are over-provisioned try to remove old models
for _, v := range machinesWithoutLatestModel {
if len(toDelete) >= overProvisionCount {
return toDelete, nil
log.Info("over-provisioned", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "machinesWithoutLatestModel", getProviderIDs(machinesWithoutLatestModel), "skipModelReconciliation", feature.Gates.Enabled(feature.SkipMachinePoolModelReconciliation))

// When SkipMachinePoolModelReconciliation is enabled, skip prioritizing machines without latest model.
// Just delete the oldest ready machines to meet the desired replica count.
if !feature.Gates.Enabled(feature.SkipMachinePoolModelReconciliation) {
Comment thread
jackfrancis marked this conversation as resolved.
Outdated
// we are over-provisioned try to remove old models
for _, v := range machinesWithoutLatestModel {
if len(toDelete) >= overProvisionCount {
return toDelete, nil
}

toDelete = append(toDelete, v)
}

toDelete = append(toDelete, v)
}

log.Info("over-provisioned ready", "desiredReplicaCount", desiredReplicaCount, "overProvisionCount", overProvisionCount, "readyMachines", getProviderIDs(readyMachines))
Expand All @@ -205,7 +211,9 @@ func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx co
return toDelete, nil
}

if len(machinesWithoutLatestModel) == 0 {
// When SkipMachinePoolModelReconciliation is disabled, check if all machines have latest model.
// When enabled, skip this check to allow stale model machines to persist.
if !feature.Gates.Enabled(feature.SkipMachinePoolModelReconciliation) && len(machinesWithoutLatestModel) == 0 {
Comment thread
jackfrancis marked this conversation as resolved.
Outdated
log.Info("nothing more to do since all the AzureMachinePoolMachine(s) are the latest model and not over-provisioned")
return []infrav1exp.AzureMachinePoolMachine{}, nil
}
Expand All @@ -216,12 +224,18 @@ func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx co
}

var toDelete []infrav1exp.AzureMachinePoolMachine
log.Info("removing ready machines within disruption budget", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines))
log.Info("removing ready machines within disruption budget", "desiredReplicaCount", desiredReplicaCount, "maxUnavailable", maxUnavailable, "readyMachines", getProviderIDs(readyMachines), "readyMachinesCount", len(readyMachines), "skipModelReconciliation", feature.Gates.Enabled(feature.SkipMachinePoolModelReconciliation))
for _, v := range readyMachines {
if len(toDelete) >= disruptionBudget {
return toDelete, nil
}

// When SkipMachinePoolModelReconciliation is enabled, don't use LatestModelApplied as a deletion criterion.
// This allows machines with stale models to persist at steady desired replica count.
if feature.Gates.Enabled(feature.SkipMachinePoolModelReconciliation) {
continue
}
Comment thread
jackfrancis marked this conversation as resolved.
Outdated

if !v.Status.LatestModelApplied {
toDelete = append(toDelete, v)
}
Expand Down
20 changes: 15 additions & 5 deletions feature/feature.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,15 @@ const (
// owner: @nawazkh
// alpha: v1.18
APIServerILB featuregate.Feature = "APIServerILB"

// SkipMachinePoolModelReconciliation is a CAPZ feature gate to skip automatic reconciliation
// of AzureMachinePool instances to match the latest VMSS model. When enabled, instances
// with stale models will persist until the pool is explicitly scaled. This is useful for
// testing scenarios where model changes should not trigger automatic instance replacement.
Comment thread
jackfrancis marked this conversation as resolved.
// Defaults to false.
// owner: @jackfrancis
// alpha: v1.24
SkipMachinePoolModelReconciliation featuregate.Feature = "SkipMachinePoolModelReconciliation"
)

func init() {
Expand All @@ -68,9 +77,10 @@ func init() {
// To add a new feature, define a key for it above and add it here.
var defaultCAPZFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
// Every feature should be initiated here:
AKS: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // Remove in 1.12
AKSResourceHealth: {Default: false, PreRelease: featuregate.Alpha},
EdgeZone: {Default: false, PreRelease: featuregate.Alpha},
ASOAPI: {Default: true, PreRelease: featuregate.GA},
APIServerILB: {Default: false, PreRelease: featuregate.Alpha},
AKS: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // Remove in 1.12
AKSResourceHealth: {Default: false, PreRelease: featuregate.Alpha},
EdgeZone: {Default: false, PreRelease: featuregate.Alpha},
ASOAPI: {Default: true, PreRelease: featuregate.GA},
APIServerILB: {Default: false, PreRelease: featuregate.Alpha},
SkipMachinePoolModelReconciliation: {Default: false, PreRelease: featuregate.Alpha},
}
Loading