@@ -17,6 +17,7 @@ package nodeclass
1717import (
1818 "context"
1919 "fmt"
20+ "sort"
2021 "strings"
2122 "time"
2223
@@ -50,6 +51,7 @@ import (
5051
5152const (
5253 requeueAfterTime = 10 * time .Minute
54+ ConditionReasonNoInstanceTypesCompatible = "NoInstanceTypesCompatible"
5355 ConditionReasonCreateFleetAuthFailed = "CreateFleetAuthCheckFailed"
5456 ConditionReasonCreateLaunchTemplateAuthFailed = "CreateLaunchTemplateAuthCheckFailed"
5557 ConditionReasonRunInstancesAuthFailed = "RunInstancesAuthCheckFailed"
@@ -153,7 +155,11 @@ func (v *Validation) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass)
153155 return reconcile.Result {}, reconcile .TerminalError (fmt .Errorf ("validating tags, %w" , err ))
154156 }
155157
156- if val , ok := v .cache .Get (v .cacheKey (nodeClass , tags )); ok {
158+ nodePools , err := nodepoolutils .ListManaged (ctx , v .kubeClient , v .cloudProvider , nodepoolutils .ForNodeClass (nodeClass ))
159+ if err != nil {
160+ return reconcile.Result {}, fmt .Errorf ("listing nodepools for nodeclass, %w" , err )
161+ }
162+ if val , ok := v .cache .Get (v .cacheKey (nodeClass , nodePools , tags )); ok {
157163 // We still update the status condition even if it's cached since we may have had a conflict error previously
158164 if val == "" {
159165 nodeClass .StatusConditions ().SetTrue (v1 .ConditionTypeValidationSucceeded )
@@ -169,32 +175,32 @@ func (v *Validation) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass)
169175
170176 if v .dryRunDisabled {
171177 nodeClass .StatusConditions ().SetTrue (v1 .ConditionTypeValidationSucceeded )
172- v .cache .SetDefault (v .cacheKey (nodeClass , tags ), "" )
178+ v .cache .SetDefault (v .cacheKey (nodeClass , nodePools , tags ), "" )
173179 return reconcile.Result {RequeueAfter : requeueAfterTime }, nil
174180 }
175181
176- launchTemplate , result , err := v .validateCreateLaunchTemplateAuthorization (ctx , nodeClass , nodeClaim , tags )
182+ launchTemplate , result , err := v .validateCreateLaunchTemplateAuthorization (ctx , nodeClass , nodePools , nodeClaim , tags )
177183 if err != nil || ! lo .IsEmpty (result ) {
178184 return result , err
179185 }
180186
181- result , err = v .validateCreateFleetAuthorization (ctx , nodeClass , tags , launchTemplate )
187+ result , err = v .validateCreateFleetAuthorization (ctx , nodeClass , nodePools , tags , launchTemplate )
182188 if err != nil || ! lo .IsEmpty (result ) {
183189 return result , err
184190 }
185191
186- result , err = v .validateRunInstancesAuthorization (ctx , nodeClass , tags , launchTemplate )
192+ result , err = v .validateRunInstancesAuthorization (ctx , nodeClass , nodePools , tags , launchTemplate )
187193 if err != nil || ! lo .IsEmpty (result ) {
188194 return result , err
189195 }
190196
191- v .cache .SetDefault (v .cacheKey (nodeClass , tags ), "" )
197+ v .cache .SetDefault (v .cacheKey (nodeClass , nodePools , tags ), "" )
192198 nodeClass .StatusConditions ().SetTrue (v1 .ConditionTypeValidationSucceeded )
193199 return reconcile.Result {RequeueAfter : requeueAfterTime }, nil
194200}
195201
196- func (v * Validation ) updateCacheOnFailure (nodeClass * v1.EC2NodeClass , tags map [string ]string , failureReason string ) {
197- v .cache .SetDefault (v .cacheKey (nodeClass , tags ), failureReason )
202+ func (v * Validation ) updateCacheOnFailure (nodeClass * v1.EC2NodeClass , nodePools [] * karpv1. NodePool , tags map [string ]string , failureReason string ) {
203+ v .cache .SetDefault (v .cacheKey (nodeClass , nodePools , tags ), failureReason )
198204 nodeClass .StatusConditions ().SetFalse (
199205 v1 .ConditionTypeValidationSucceeded ,
200206 failureReason ,
@@ -205,13 +211,20 @@ func (v *Validation) updateCacheOnFailure(nodeClass *v1.EC2NodeClass, tags map[s
205211func (v * Validation ) validateCreateLaunchTemplateAuthorization (
206212 ctx context.Context ,
207213 nodeClass * v1.EC2NodeClass ,
214+ nodePools []* karpv1.NodePool ,
208215 nodeClaim * karpv1.NodeClaim ,
209216 tags map [string ]string ,
210217) (launchTemplate * launchtemplate.LaunchTemplate , result reconcile.Result , err error ) {
211- instanceTypes , err := v .getPrioritizedInstanceTypes (ctx , nodeClass )
218+ instanceTypes , err := v .getPrioritizedInstanceTypes (ctx , nodeClass , nodePools )
212219 if err != nil {
213220 return nil , reconcile.Result {}, fmt .Errorf ("generating options, %w" , err )
214221 }
222+ // this can only happen when the NodeClass has no compatible instance types, so we should invalidate it
223+ if len (instanceTypes ) == 0 {
224+ log .FromContext (ctx ).Error (fmt .Errorf ("getting instance types for NodeClass validation" ), "no instance type is compatible with NodeClass" )
225+ v .updateCacheOnFailure (nodeClass , nodePools , tags , ConditionReasonNoInstanceTypesCompatible )
226+ return nil , reconcile.Result {RequeueAfter : requeueAfterTime }, nil
227+ }
215228 // pass 1 instance type in EnsureAll to only create 1 launch template
216229 tenancyType , err := v .getTenancyType (ctx , nodeClass )
217230 if err != nil {
@@ -227,7 +240,7 @@ func (v *Validation) validateCreateLaunchTemplateAuthorization(
227240 return nil , reconcile.Result {}, fmt .Errorf ("validating ec2:CreateLaunchTemplate authorization, %w" , err )
228241 }
229242 log .FromContext (ctx ).Error (err , "unauthorized to call ec2:CreateLaunchTemplate" )
230- v .updateCacheOnFailure (nodeClass , tags , ConditionReasonCreateLaunchTemplateAuthFailed )
243+ v .updateCacheOnFailure (nodeClass , nodePools , tags , ConditionReasonCreateLaunchTemplateAuthFailed )
231244 return nil , reconcile.Result {RequeueAfter : requeueAfterTime }, nil
232245 }
233246 // this case should never occur as we ensure instance types are compatible with AMI
@@ -240,6 +253,7 @@ func (v *Validation) validateCreateLaunchTemplateAuthorization(
240253func (v * Validation ) validateCreateFleetAuthorization (
241254 ctx context.Context ,
242255 nodeClass * v1.EC2NodeClass ,
256+ nodePools []* karpv1.NodePool ,
243257 tags map [string ]string ,
244258 launchTemplate * launchtemplate.LaunchTemplate ,
245259) (result reconcile.Result , err error ) {
@@ -259,7 +273,7 @@ func (v *Validation) validateCreateFleetAuthorization(
259273 return reconcile.Result {}, fmt .Errorf ("validating ec2:CreateFleet authorization, %w" , err )
260274 }
261275 log .FromContext (ctx ).Error (err , "unauthorized to call ec2:CreateFleet" )
262- v .updateCacheOnFailure (nodeClass , tags , ConditionReasonCreateFleetAuthFailed )
276+ v .updateCacheOnFailure (nodeClass , nodePools , tags , ConditionReasonCreateFleetAuthFailed )
263277 return reconcile.Result {RequeueAfter : requeueAfterTime }, nil
264278 }
265279 return reconcile.Result {}, nil
@@ -268,6 +282,7 @@ func (v *Validation) validateCreateFleetAuthorization(
268282func (v * Validation ) validateRunInstancesAuthorization (
269283 ctx context.Context ,
270284 nodeClass * v1.EC2NodeClass ,
285+ nodePools []* karpv1.NodePool ,
271286 tags map [string ]string ,
272287 launchTemplate * launchtemplate.LaunchTemplate ,
273288) (result reconcile.Result , err error ) {
@@ -287,7 +302,7 @@ func (v *Validation) validateRunInstancesAuthorization(
287302 return reconcile.Result {}, fmt .Errorf ("validating ec2:RunInstances authorization, %w" , err )
288303 }
289304 log .FromContext (ctx ).Error (err , "unauthorized to call ec2:RunInstances" )
290- v .updateCacheOnFailure (nodeClass , tags , ConditionReasonRunInstancesAuthFailed )
305+ v .updateCacheOnFailure (nodeClass , nodePools , tags , ConditionReasonRunInstancesAuthFailed )
291306 return reconcile.Result {RequeueAfter : requeueAfterTime }, nil
292307 }
293308 return reconcile.Result {}, nil
@@ -302,7 +317,14 @@ func (*Validation) requiredConditions() []string {
302317 }
303318}
304319
305- func (* Validation ) cacheKey (nodeClass * v1.EC2NodeClass , tags map [string ]string ) string {
320+ func (* Validation ) cacheKey (nodeClass * v1.EC2NodeClass , nodePools []* karpv1.NodePool , tags map [string ]string ) string {
321+ sort .Slice (nodePools , func (i , j int ) bool {
322+ return nodePools [i ].Name < nodePools [j ].Name
323+ })
324+ nodePoolReqHashes := lo .Reduce (nodePools , func (agg []uint64 , np * karpv1.NodePool , _ int ) []uint64 {
325+ reqHash := lo .Must (hashstructure .Hash (np .Spec .Template .Spec .Requirements , hashstructure .FormatV2 , & hashstructure.HashOptions {SlicesAsSets : true }))
326+ return append (agg , reqHash )
327+ }, []uint64 {})
306328 hash := lo .Must (hashstructure .Hash ([]any {
307329 nodeClass .Status .Subnets ,
308330 nodeClass .Status .SecurityGroups ,
@@ -311,6 +333,7 @@ func (*Validation) cacheKey(nodeClass *v1.EC2NodeClass, tags map[string]string)
311333 nodeClass .Spec ,
312334 nodeClass .Annotations ,
313335 tags ,
336+ nodePoolReqHashes ,
314337 }, hashstructure .FormatV2 , & hashstructure.HashOptions {SlicesAsSets : true }))
315338 return fmt .Sprintf ("%s:%016x" , nodeClass .Name , hash )
316339}
@@ -396,26 +419,27 @@ func getFleetLaunchTemplateConfig(
396419 }
397420}
398421
399- func (v * Validation ) getPrioritizedInstanceTypes (ctx context.Context , nodeClass * v1.EC2NodeClass ) ([]* cloudprovider.InstanceType , error ) {
422+ func (v * Validation ) getPrioritizedInstanceTypes (ctx context.Context , nodeClass * v1.EC2NodeClass , nodePools [] * karpv1. NodePool ) ([]* cloudprovider.InstanceType , error ) {
400423 // Select an instance type to use for validation. If NodePools exist for this NodeClass, we'll use an instance type
401424 // selected by one of those NodePools. If no NodePools exists for this NodeClass, we'll use an instance type that could be
402425 // selected with an open NodePool. We should also prioritize an InstanceType which will launch with a non-GPU
403426 // (VariantStandard) AMI, since GPU AMIs may have a larger snapshot size than that supported by the NodeClass'
404427 // blockDeviceMappings.
405428 // Historical Issue: https://github.com/aws/karpenter-provider-aws/issues/7928
406- instanceTypes , err := v .getInstanceTypesForNodeClass (ctx , nodeClass )
429+ instanceTypes , err := v .getInstanceTypesForNodeClass (ctx , nodeClass , nodePools )
407430 if err != nil {
408431 return nil , err
409432 }
410-
411- // If there weren't any matching instance types, we should fallback to some defaults. There's an instance type included
412- // for both x86_64 and arm64 architectures, ensuring that there will be a matching AMI. We also fallback to the default
413- // instance types if the AMI family is Windows. Karpenter currently incorrectly marks certain instance types as Windows
414- // compatible, and dynamic instance type resolution may choose those instance types for the dry-run, even if they
433+ // If there weren't any matching instance types, we should invalidate the NodeClass.
434+ if len (instanceTypes ) == 0 {
435+ return nil , nil
436+ }
437+ // We fallback to the default instance type if the AMI family is Windows. Karpenter currently incorrectly marks certain instance
438+ // types as Windows compatible, and dynamic instance type resolution may choose those instance types for the dry-run, even if they
415439 // wouldn't be chosen due to cost in practice. This ensures the behavior matches that on Karpenter v1.3, preventing a
416440 // potential regression for Windows users.
417441 // Tracking issue: https://github.com/aws/karpenter-provider-aws/issues/7985
418- if len ( instanceTypes ) == 0 || lo .ContainsBy ([]string {
442+ if lo .ContainsBy ([]string {
419443 v1 .AMIFamilyWindows2019 ,
420444 v1 .AMIFamilyWindows2022 ,
421445 v1 .AMIFamilyWindows2025 ,
@@ -432,15 +456,6 @@ func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass
432456 scheduling .NewRequirement (corev1 .LabelWindowsBuild , corev1 .NodeSelectorOpExists ),
433457 )... ),
434458 },
435- {
436- Name : string (ec2types .InstanceTypeM6gLarge ),
437- Requirements : scheduling .NewRequirements (append (
438- lo .Values (amifamily .VariantStandard .Requirements ()),
439- scheduling .NewRequirement (corev1 .LabelArchStable , corev1 .NodeSelectorOpIn , karpv1 .ArchitectureArm64 ),
440- scheduling .NewRequirement (corev1 .LabelOSStable , corev1 .NodeSelectorOpExists ),
441- scheduling .NewRequirement (corev1 .LabelWindowsBuild , corev1 .NodeSelectorOpExists ),
442- )... ),
443- },
444459 }
445460 instanceTypes = getAMICompatibleInstanceTypes (instanceTypes , nodeClass )
446461 }
@@ -451,16 +466,12 @@ func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass
451466// getInstanceTypesForNodeClass returns the set of instances which could be launched using this NodeClass based on the
452467// requirements of linked NodePools. If no NodePools exist, this function returns the instances which could be launched using
453468// the NodeClass and an open NodePool.
454- func (v * Validation ) getInstanceTypesForNodeClass (ctx context.Context , nodeClass * v1.EC2NodeClass ) ([]* cloudprovider.InstanceType , error ) {
469+ func (v * Validation ) getInstanceTypesForNodeClass (ctx context.Context , nodeClass * v1.EC2NodeClass , nodePools [] * karpv1. NodePool ) ([]* cloudprovider.InstanceType , error ) {
455470 instanceTypes , err := v .instanceTypeProvider .List (ctx , nodeClass )
456471 if err != nil {
457472 return nil , fmt .Errorf ("listing instance types for nodeclass, %w" , err )
458473 }
459474 instanceTypes = v .instanceTypeProvider .FilterForNodeClass (instanceTypes , nodeClass )
460- nodePools , err := nodepoolutils .ListManaged (ctx , v .kubeClient , v .cloudProvider , nodepoolutils .ForNodeClass (nodeClass ))
461- if err != nil {
462- return nil , fmt .Errorf ("listing nodepools for nodeclass, %w" , err )
463- }
464475 if len (nodePools ) == 0 {
465476 return getAMICompatibleInstanceTypes (instanceTypes , nodeClass ), nil
466477 }
0 commit comments