Skip to content

Commit bb68557

Browse files
committed
account for nodepool changes
1 parent 6081a82 commit bb68557

File tree

3 files changed

+86
-46
lines changed

3 files changed

+86
-46
lines changed

pkg/controllers/nodeclass/controller.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,25 @@ func (c *Controller) Register(_ context.Context, m manager.Manager) error {
246246
DeleteFunc: func(e event.DeleteEvent) bool { return true },
247247
}),
248248
).
249+
Watches(
250+
&karpv1.NodePool{},
251+
handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, o client.Object) []reconcile.Request {
252+
np := o.(*karpv1.NodePool)
253+
if np.Spec.Template.Spec.NodeClassRef == nil {
254+
return nil
255+
}
256+
return []reconcile.Request{{NamespacedName: types.NamespacedName{Name: np.Spec.Template.Spec.NodeClassRef.Name}}}
257+
}),
258+
builder.WithPredicates(predicate.Funcs{
259+
CreateFunc: func(e event.CreateEvent) bool { return true },
260+
UpdateFunc: func(e event.UpdateEvent) bool {
261+
oldNP := e.ObjectOld.(*karpv1.NodePool)
262+
newNP := e.ObjectNew.(*karpv1.NodePool)
263+
return !equality.Semantic.DeepEqual(oldNP.Spec.Template.Spec.Requirements, newNP.Spec.Template.Spec.Requirements)
264+
},
265+
DeleteFunc: func(e event.DeleteEvent) bool { return true },
266+
}),
267+
).
249268
WithOptions(controller.Options{
250269
RateLimiter: reasonable.RateLimiter(),
251270
MaxConcurrentReconciles: 10,

pkg/controllers/nodeclass/validation.go

Lines changed: 46 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package nodeclass
1717
import (
1818
"context"
1919
"fmt"
20+
"sort"
2021
"strings"
2122
"time"
2223

@@ -50,6 +51,7 @@ import (
5051

5152
const (
5253
requeueAfterTime = 10 * time.Minute
54+
ConditionReasonNoInstanceTypesCompatible = "NoInstanceTypesCompatible"
5355
ConditionReasonCreateFleetAuthFailed = "CreateFleetAuthCheckFailed"
5456
ConditionReasonCreateLaunchTemplateAuthFailed = "CreateLaunchTemplateAuthCheckFailed"
5557
ConditionReasonRunInstancesAuthFailed = "RunInstancesAuthCheckFailed"
@@ -153,7 +155,11 @@ func (v *Validation) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass)
153155
return reconcile.Result{}, reconcile.TerminalError(fmt.Errorf("validating tags, %w", err))
154156
}
155157

156-
if val, ok := v.cache.Get(v.cacheKey(nodeClass, tags)); ok {
158+
nodePools, err := nodepoolutils.ListManaged(ctx, v.kubeClient, v.cloudProvider, nodepoolutils.ForNodeClass(nodeClass))
159+
if err != nil {
160+
return reconcile.Result{}, fmt.Errorf("listing nodepools for nodeclass, %w", err)
161+
}
162+
if val, ok := v.cache.Get(v.cacheKey(nodeClass, nodePools, tags)); ok {
157163
// We still update the status condition even if it's cached since we may have had a conflict error previously
158164
if val == "" {
159165
nodeClass.StatusConditions().SetTrue(v1.ConditionTypeValidationSucceeded)
@@ -169,32 +175,32 @@ func (v *Validation) Reconcile(ctx context.Context, nodeClass *v1.EC2NodeClass)
169175

170176
if v.dryRunDisabled {
171177
nodeClass.StatusConditions().SetTrue(v1.ConditionTypeValidationSucceeded)
172-
v.cache.SetDefault(v.cacheKey(nodeClass, tags), "")
178+
v.cache.SetDefault(v.cacheKey(nodeClass, nodePools, tags), "")
173179
return reconcile.Result{RequeueAfter: requeueAfterTime}, nil
174180
}
175181

176-
launchTemplate, result, err := v.validateCreateLaunchTemplateAuthorization(ctx, nodeClass, nodeClaim, tags)
182+
launchTemplate, result, err := v.validateCreateLaunchTemplateAuthorization(ctx, nodeClass, nodePools, nodeClaim, tags)
177183
if err != nil || !lo.IsEmpty(result) {
178184
return result, err
179185
}
180186

181-
result, err = v.validateCreateFleetAuthorization(ctx, nodeClass, tags, launchTemplate)
187+
result, err = v.validateCreateFleetAuthorization(ctx, nodeClass, nodePools, tags, launchTemplate)
182188
if err != nil || !lo.IsEmpty(result) {
183189
return result, err
184190
}
185191

186-
result, err = v.validateRunInstancesAuthorization(ctx, nodeClass, tags, launchTemplate)
192+
result, err = v.validateRunInstancesAuthorization(ctx, nodeClass, nodePools, tags, launchTemplate)
187193
if err != nil || !lo.IsEmpty(result) {
188194
return result, err
189195
}
190196

191-
v.cache.SetDefault(v.cacheKey(nodeClass, tags), "")
197+
v.cache.SetDefault(v.cacheKey(nodeClass, nodePools, tags), "")
192198
nodeClass.StatusConditions().SetTrue(v1.ConditionTypeValidationSucceeded)
193199
return reconcile.Result{RequeueAfter: requeueAfterTime}, nil
194200
}
195201

196-
func (v *Validation) updateCacheOnFailure(nodeClass *v1.EC2NodeClass, tags map[string]string, failureReason string) {
197-
v.cache.SetDefault(v.cacheKey(nodeClass, tags), failureReason)
202+
func (v *Validation) updateCacheOnFailure(nodeClass *v1.EC2NodeClass, nodePools []*karpv1.NodePool, tags map[string]string, failureReason string) {
203+
v.cache.SetDefault(v.cacheKey(nodeClass, nodePools, tags), failureReason)
198204
nodeClass.StatusConditions().SetFalse(
199205
v1.ConditionTypeValidationSucceeded,
200206
failureReason,
@@ -205,13 +211,20 @@ func (v *Validation) updateCacheOnFailure(nodeClass *v1.EC2NodeClass, tags map[s
205211
func (v *Validation) validateCreateLaunchTemplateAuthorization(
206212
ctx context.Context,
207213
nodeClass *v1.EC2NodeClass,
214+
nodePools []*karpv1.NodePool,
208215
nodeClaim *karpv1.NodeClaim,
209216
tags map[string]string,
210217
) (launchTemplate *launchtemplate.LaunchTemplate, result reconcile.Result, err error) {
211-
instanceTypes, err := v.getPrioritizedInstanceTypes(ctx, nodeClass)
218+
instanceTypes, err := v.getPrioritizedInstanceTypes(ctx, nodeClass, nodePools)
212219
if err != nil {
213220
return nil, reconcile.Result{}, fmt.Errorf("generating options, %w", err)
214221
}
222+
// this can only happen when the NodeClass has no compatible instance types, so we should invalidate it
223+
if len(instanceTypes) == 0 {
224+
log.FromContext(ctx).Error(fmt.Errorf("getting instance types for NodeClass validation"), "no instance type is compatible with NodeClass")
225+
v.updateCacheOnFailure(nodeClass, nodePools, tags, ConditionReasonNoInstanceTypesCompatible)
226+
return nil, reconcile.Result{RequeueAfter: requeueAfterTime}, nil
227+
}
215228
// pass 1 instance type in EnsureAll to only create 1 launch template
216229
tenancyType, err := v.getTenancyType(ctx, nodeClass)
217230
if err != nil {
@@ -227,7 +240,7 @@ func (v *Validation) validateCreateLaunchTemplateAuthorization(
227240
return nil, reconcile.Result{}, fmt.Errorf("validating ec2:CreateLaunchTemplate authorization, %w", err)
228241
}
229242
log.FromContext(ctx).Error(err, "unauthorized to call ec2:CreateLaunchTemplate")
230-
v.updateCacheOnFailure(nodeClass, tags, ConditionReasonCreateLaunchTemplateAuthFailed)
243+
v.updateCacheOnFailure(nodeClass, nodePools, tags, ConditionReasonCreateLaunchTemplateAuthFailed)
231244
return nil, reconcile.Result{RequeueAfter: requeueAfterTime}, nil
232245
}
233246
// this case should never occur as we ensure instance types are compatible with AMI
@@ -240,6 +253,7 @@ func (v *Validation) validateCreateLaunchTemplateAuthorization(
240253
func (v *Validation) validateCreateFleetAuthorization(
241254
ctx context.Context,
242255
nodeClass *v1.EC2NodeClass,
256+
nodePools []*karpv1.NodePool,
243257
tags map[string]string,
244258
launchTemplate *launchtemplate.LaunchTemplate,
245259
) (result reconcile.Result, err error) {
@@ -259,7 +273,7 @@ func (v *Validation) validateCreateFleetAuthorization(
259273
return reconcile.Result{}, fmt.Errorf("validating ec2:CreateFleet authorization, %w", err)
260274
}
261275
log.FromContext(ctx).Error(err, "unauthorized to call ec2:CreateFleet")
262-
v.updateCacheOnFailure(nodeClass, tags, ConditionReasonCreateFleetAuthFailed)
276+
v.updateCacheOnFailure(nodeClass, nodePools, tags, ConditionReasonCreateFleetAuthFailed)
263277
return reconcile.Result{RequeueAfter: requeueAfterTime}, nil
264278
}
265279
return reconcile.Result{}, nil
@@ -268,6 +282,7 @@ func (v *Validation) validateCreateFleetAuthorization(
268282
func (v *Validation) validateRunInstancesAuthorization(
269283
ctx context.Context,
270284
nodeClass *v1.EC2NodeClass,
285+
nodePools []*karpv1.NodePool,
271286
tags map[string]string,
272287
launchTemplate *launchtemplate.LaunchTemplate,
273288
) (result reconcile.Result, err error) {
@@ -287,7 +302,7 @@ func (v *Validation) validateRunInstancesAuthorization(
287302
return reconcile.Result{}, fmt.Errorf("validating ec2:RunInstances authorization, %w", err)
288303
}
289304
log.FromContext(ctx).Error(err, "unauthorized to call ec2:RunInstances")
290-
v.updateCacheOnFailure(nodeClass, tags, ConditionReasonRunInstancesAuthFailed)
305+
v.updateCacheOnFailure(nodeClass, nodePools, tags, ConditionReasonRunInstancesAuthFailed)
291306
return reconcile.Result{RequeueAfter: requeueAfterTime}, nil
292307
}
293308
return reconcile.Result{}, nil
@@ -302,7 +317,14 @@ func (*Validation) requiredConditions() []string {
302317
}
303318
}
304319

305-
func (*Validation) cacheKey(nodeClass *v1.EC2NodeClass, tags map[string]string) string {
320+
func (*Validation) cacheKey(nodeClass *v1.EC2NodeClass, nodePools []*karpv1.NodePool, tags map[string]string) string {
321+
sort.Slice(nodePools, func(i, j int) bool {
322+
return nodePools[i].Name < nodePools[j].Name
323+
})
324+
nodePoolReqHashes := lo.Reduce(nodePools, func(agg []uint64, np *karpv1.NodePool, _ int) []uint64 {
325+
reqHash := lo.Must(hashstructure.Hash(np.Spec.Template.Spec.Requirements, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true}))
326+
return append(agg, reqHash)
327+
}, []uint64{})
306328
hash := lo.Must(hashstructure.Hash([]any{
307329
nodeClass.Status.Subnets,
308330
nodeClass.Status.SecurityGroups,
@@ -311,6 +333,7 @@ func (*Validation) cacheKey(nodeClass *v1.EC2NodeClass, tags map[string]string)
311333
nodeClass.Spec,
312334
nodeClass.Annotations,
313335
tags,
336+
nodePoolReqHashes,
314337
}, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true}))
315338
return fmt.Sprintf("%s:%016x", nodeClass.Name, hash)
316339
}
@@ -396,26 +419,27 @@ func getFleetLaunchTemplateConfig(
396419
}
397420
}
398421

399-
func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass *v1.EC2NodeClass) ([]*cloudprovider.InstanceType, error) {
422+
func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass *v1.EC2NodeClass, nodePools []*karpv1.NodePool) ([]*cloudprovider.InstanceType, error) {
400423
// Select an instance type to use for validation. If NodePools exist for this NodeClass, we'll use an instance type
401424
// selected by one of those NodePools. If no NodePools exists for this NodeClass, we'll use an instance type that could be
402425
// selected with an open NodePool. We should also prioritize an InstanceType which will launch with a non-GPU
403426
// (VariantStandard) AMI, since GPU AMIs may have a larger snapshot size than that supported by the NodeClass'
404427
// blockDeviceMappings.
405428
// Historical Issue: https://github.com/aws/karpenter-provider-aws/issues/7928
406-
instanceTypes, err := v.getInstanceTypesForNodeClass(ctx, nodeClass)
429+
instanceTypes, err := v.getInstanceTypesForNodeClass(ctx, nodeClass, nodePools)
407430
if err != nil {
408431
return nil, err
409432
}
410-
411-
// If there weren't any matching instance types, we should fallback to some defaults. There's an instance type included
412-
// for both x86_64 and arm64 architectures, ensuring that there will be a matching AMI. We also fallback to the default
413-
// instance types if the AMI family is Windows. Karpenter currently incorrectly marks certain instance types as Windows
414-
// compatible, and dynamic instance type resolution may choose those instance types for the dry-run, even if they
433+
// If there weren't any matching instance types, we should invalidate the NodeClass.
434+
if len(instanceTypes) == 0 {
435+
return nil, nil
436+
}
437+
// We fallback to the default instance type if the AMI family is Windows. Karpenter currently incorrectly marks certain instance
438+
// types as Windows compatible, and dynamic instance type resolution may choose those instance types for the dry-run, even if they
415439
// wouldn't be chosen due to cost in practice. This ensures the behavior matches that on Karpenter v1.3, preventing a
416440
// potential regression for Windows users.
417441
// Tracking issue: https://github.com/aws/karpenter-provider-aws/issues/7985
418-
if len(instanceTypes) == 0 || lo.ContainsBy([]string{
442+
if lo.ContainsBy([]string{
419443
v1.AMIFamilyWindows2019,
420444
v1.AMIFamilyWindows2022,
421445
v1.AMIFamilyWindows2025,
@@ -432,15 +456,6 @@ func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass
432456
scheduling.NewRequirement(corev1.LabelWindowsBuild, corev1.NodeSelectorOpExists),
433457
)...),
434458
},
435-
{
436-
Name: string(ec2types.InstanceTypeM6gLarge),
437-
Requirements: scheduling.NewRequirements(append(
438-
lo.Values(amifamily.VariantStandard.Requirements()),
439-
scheduling.NewRequirement(corev1.LabelArchStable, corev1.NodeSelectorOpIn, karpv1.ArchitectureArm64),
440-
scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpExists),
441-
scheduling.NewRequirement(corev1.LabelWindowsBuild, corev1.NodeSelectorOpExists),
442-
)...),
443-
},
444459
}
445460
instanceTypes = getAMICompatibleInstanceTypes(instanceTypes, nodeClass)
446461
}
@@ -451,16 +466,12 @@ func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass
451466
// getInstanceTypesForNodeClass returns the set of instances which could be launched using this NodeClass based on the
452467
// requirements of linked NodePools. If no NodePools exist, this function returns the instances which could be launched using
453468
// the NodeClass and an open NodePool.
454-
func (v *Validation) getInstanceTypesForNodeClass(ctx context.Context, nodeClass *v1.EC2NodeClass) ([]*cloudprovider.InstanceType, error) {
469+
func (v *Validation) getInstanceTypesForNodeClass(ctx context.Context, nodeClass *v1.EC2NodeClass, nodePools []*karpv1.NodePool) ([]*cloudprovider.InstanceType, error) {
455470
instanceTypes, err := v.instanceTypeProvider.List(ctx, nodeClass)
456471
if err != nil {
457472
return nil, fmt.Errorf("listing instance types for nodeclass, %w", err)
458473
}
459474
instanceTypes = v.instanceTypeProvider.FilterForNodeClass(instanceTypes, nodeClass)
460-
nodePools, err := nodepoolutils.ListManaged(ctx, v.kubeClient, v.cloudProvider, nodepoolutils.ForNodeClass(nodeClass))
461-
if err != nil {
462-
return nil, fmt.Errorf("listing nodepools for nodeclass, %w", err)
463-
}
464475
if len(nodePools) == 0 {
465476
return getAMICompatibleInstanceTypes(instanceTypes, nodeClass), nil
466477
}

pkg/controllers/nodeclass/validation_test.go

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ package nodeclass_test
1616

1717
import (
1818
"fmt"
19-
"strings"
2019
"time"
2120

2221
"github.com/awslabs/operatorpkg/object"
@@ -309,24 +308,35 @@ var _ = Describe("NodeClass Validation Status Controller", func() {
309308
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())
310309
})
311310
DescribeTable(
312-
"should fallback to static instance types when no linked NodePools exist",
313-
func(expectedInstanceType ec2types.InstanceType, expectedAMIID string) {
314-
// Filter out the non-target standard AMI to ensure the right instance is selected, but leave the nvidia AMI to
315-
// test AMI selection.
316-
awsEnv.SSMAPI.Parameters = lo.PickBy(awsEnv.SSMAPI.Parameters, func(_, amiID string) bool {
317-
return amiID == expectedAMIID || strings.Contains(amiID, "nvidia")
311+
"should fallback to static instance types when windows ami is used",
312+
func(family string, terms []v1.AMISelectorTerm, expectedInstanceType ec2types.InstanceType, expectedAMIID string) {
313+
// Skip Windows 2025 on versions < 1.35
314+
if family == v1.AMIFamilyWindows2025 && version.MustParseGeneric(awsEnv.VersionProvider.Get(ctx)).Minor() < 35 {
315+
Skip("Windows 2025 requires EKS version 1.35+, current version: " + awsEnv.VersionProvider.Get(ctx))
316+
}
317+
nodeClass.Spec.AMIFamily = lo.ToPtr(family)
318+
nodeClass.Spec.AMISelectorTerms = []v1.AMISelectorTerm{
319+
{
320+
Tags: map[string]string{"*": "*"},
321+
},
322+
}
323+
// Filter DescribeImages to use only the expected AMI
324+
allImages := awsEnv.EC2API.DescribeImagesOutput.Clone().Images
325+
awsEnv.EC2API.DescribeImagesOutput.Set(&ec2.DescribeImagesOutput{
326+
Images: lo.Filter(allImages, func(img ec2types.Image, _ int) bool {
327+
return lo.FromPtr(img.ImageId) == expectedAMIID
328+
}),
318329
})
319-
Expect(len(awsEnv.SSMAPI.Parameters)).To(BeNumerically(">", 1))
320-
321330
ExpectApplied(ctx, env.Client, nodeClass)
322331
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)
323332
launchTemplateInput := awsEnv.EC2API.CreateLaunchTemplateBehavior.CalledWithInput.Pop()
324333
runInstancesInput := awsEnv.EC2API.RunInstancesBehavior.CalledWithInput.Pop()
325334
Expect(runInstancesInput.InstanceType).To(Equal(expectedInstanceType))
326335
Expect(launchTemplateInput.LaunchTemplateData.ImageId).To(PointTo(Equal(expectedAMIID)))
327336
},
328-
Entry("m5.large", ec2types.InstanceTypeM5Large, "amd64-ami-id"),
329-
Entry("m6g.large", ec2types.InstanceTypeM6gLarge, "arm64-ami-id"),
337+
Entry("Windows2019 with m5.large", v1.AMIFamilyWindows2019, []v1.AMISelectorTerm{{Alias: "windows2019@latest"}}, ec2types.InstanceTypeM5Large, "amd64-ami-id"),
338+
Entry("Windows2022 with m5.large", v1.AMIFamilyWindows2022, []v1.AMISelectorTerm{{Alias: "windows2022@latest"}}, ec2types.InstanceTypeM5Large, "amd64-ami-id"),
339+
Entry("Windows2025 with m6g.large", v1.AMIFamilyWindows2025, []v1.AMISelectorTerm{{Alias: "windows2025@latest"}}, ec2types.InstanceTypeM6gLarge, "arm64-ami-id"),
330340
)
331341
It("should prioritize non-GPU instances", func() {
332342
nodePool := coretest.NodePool(karpv1.NodePool{Spec: karpv1.NodePoolSpec{Template: karpv1.NodeClaimTemplate{

0 commit comments

Comments
 (0)