Skip to content
Merged
122 changes: 60 additions & 62 deletions pkg/controllers/nodeclass/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,17 @@ func (v *Validation) validateCreateLaunchTemplateAuthorization(
nodeClaim *karpv1.NodeClaim,
tags map[string]string,
) (launchTemplate *launchtemplate.LaunchTemplate, result reconcile.Result, err error) {
instanceTypes, err := v.getPrioritizedInstanceTypes(ctx, nodeClass)
nodePools, err := nodepoolutils.ListManaged(ctx, v.kubeClient, v.cloudProvider, nodepoolutils.ForNodeClass(nodeClass))
if err != nil {
return nil, reconcile.Result{}, fmt.Errorf("generating options, %w", err)
return nil, reconcile.Result{}, fmt.Errorf("listing nodepools for nodeclass, %w", err)
}
// pass 1 instance type in EnsureAll to only create 1 launch template
tenancyType, err := v.getTenancyType(ctx, nodeClass)
instanceTypes, err := v.getPrioritizedInstanceTypes(ctx, nodeClass, nodePools)
if err != nil {
return nil, reconcile.Result{}, fmt.Errorf("determining instance tenancy, %w", err)
return nil, reconcile.Result{}, fmt.Errorf("generating options, %w", err)
}
// pass 1 instance type in EnsureAll to only create 1 launch template
tenancyType := getTenancyType(nodePools)

launchTemplates, err := v.launchTemplateProvider.EnsureAll(ctx, nodeClass, nodeClaim, instanceTypes[:1], karpv1.CapacityTypeOnDemand, tags, string(tenancyType))
if err != nil {
if awserrors.IsRateLimitedError(err) || awserrors.IsServerError(err) {
Expand Down Expand Up @@ -396,16 +398,20 @@ func getFleetLaunchTemplateConfig(
}
}

func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass *v1.EC2NodeClass) ([]*cloudprovider.InstanceType, error) {
// Select an instance type to use for validation. If NodePools exist for this NodeClass, we'll use an instance type
// selected by one of those NodePools. We should also prioritize an InstanceType which will launch with a non-GPU
// (VariantStandard) AMI, since GPU AMIs may have a larger snapshot size than that supported by the NodeClass'
// blockDeviceMappings.
// getPrioritizedInstanceTypes returns the set of instances which could be launched using this NodeClass based on the
// requirements of linked NodePools. If no NodePools exist for the given NodeClass, this function returns two default
// instance types (one x86_64 and one arm64). If the 2 default instance types are not compatible with the NodeClass,
// this function we'll use an instance type that could be selected with an open NodePool.
func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass *v1.EC2NodeClass, nodePools []*karpv1.NodePool) ([]*cloudprovider.InstanceType, error) {
// We should prioritize an InstanceType which will launch with a non-GPU (VariantStandard) AMI, since GPU
// AMIs may have a larger snapshot size than that supported by the NodeClass' blockDeviceMappings.
// Historical Issue: https://github.com/aws/karpenter-provider-aws/issues/7928
instanceTypes, err := v.getInstanceTypesForNodeClass(ctx, nodeClass)
instanceTypes, err := v.instanceTypeProvider.List(ctx, nodeClass)
if err != nil {
return nil, err
return nil, fmt.Errorf("listing instance types for nodeclass, %w", err)
}
instanceTypes = v.instanceTypeProvider.FilterForNodeClass(instanceTypes, nodeClass)
compatibleInstanceTypes := getNodePoolCompatibleInstanceTypes(instanceTypes, nodePools)

// If there weren't any matching instance types, we should fallback to some defaults. There's an instance type included
// for both x86_64 and arm64 architectures, ensuring that there will be a matching AMI. We also fallback to the default
Expand All @@ -414,51 +420,58 @@ func (v *Validation) getPrioritizedInstanceTypes(ctx context.Context, nodeClass
// wouldn't be chosen due to cost in practice. This ensures the behavior matches that on Karpenter v1.3, preventing a
// potential regression for Windows users.
// Tracking issue: https://github.com/aws/karpenter-provider-aws/issues/7985
if len(instanceTypes) == 0 || lo.ContainsBy([]string{
if len(compatibleInstanceTypes) == 0 || lo.ContainsBy([]string{
v1.AMIFamilyWindows2019,
v1.AMIFamilyWindows2022,
v1.AMIFamilyWindows2025,
}, func(family string) bool {
return family == nodeClass.AMIFamily()
}) {
instanceTypes = []*cloudprovider.InstanceType{
{
Name: string(ec2types.InstanceTypeM5Large),
Requirements: scheduling.NewRequirements(append(
lo.Values(amifamily.VariantStandard.Requirements()),
scheduling.NewRequirement(corev1.LabelArchStable, corev1.NodeSelectorOpIn, karpv1.ArchitectureAmd64),
scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpExists),
scheduling.NewRequirement(corev1.LabelWindowsBuild, corev1.NodeSelectorOpExists),
)...),
},
{
Name: string(ec2types.InstanceTypeM6gLarge),
Requirements: scheduling.NewRequirements(append(
lo.Values(amifamily.VariantStandard.Requirements()),
scheduling.NewRequirement(corev1.LabelArchStable, corev1.NodeSelectorOpIn, karpv1.ArchitectureArm64),
scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpExists),
scheduling.NewRequirement(corev1.LabelWindowsBuild, corev1.NodeSelectorOpExists),
)...),
},
}
instanceTypes = getAMICompatibleInstanceTypes(instanceTypes, nodeClass)
compatibleInstanceTypes = v.getFallbackInstanceTypes(instanceTypes)
}

return instanceTypes, nil
return getAMICompatibleInstanceTypes(compatibleInstanceTypes, nodeClass), nil
}

// getInstanceTypesForNodeClass returns the set of instances which could be launched using this NodeClass based on the
// requirements of linked NodePools. If no NodePools exist for the given NodeClass, this function returns two default
// instance types (one x86_64 and one arm64).
func (v *Validation) getInstanceTypesForNodeClass(ctx context.Context, nodeClass *v1.EC2NodeClass) ([]*cloudprovider.InstanceType, error) {
instanceTypes, err := v.instanceTypeProvider.List(ctx, nodeClass)
if err != nil {
return nil, fmt.Errorf("listing instance types for nodeclass, %w", err)
func (v *Validation) getFallbackInstanceTypes(instanceTypes []*cloudprovider.InstanceType) []*cloudprovider.InstanceType {
fallbackInstanceTypes := []*cloudprovider.InstanceType{
{
Name: string(ec2types.InstanceTypeM5Large),
Requirements: scheduling.NewRequirements(append(
lo.Values(amifamily.VariantStandard.Requirements()),
scheduling.NewRequirement(corev1.LabelArchStable, corev1.NodeSelectorOpIn, karpv1.ArchitectureAmd64),
scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpExists),
scheduling.NewRequirement(corev1.LabelWindowsBuild, corev1.NodeSelectorOpExists),
)...),
},
{
Name: string(ec2types.InstanceTypeM6gLarge),
Requirements: scheduling.NewRequirements(append(
lo.Values(amifamily.VariantStandard.Requirements()),
scheduling.NewRequirement(corev1.LabelArchStable, corev1.NodeSelectorOpIn, karpv1.ArchitectureArm64),
scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpExists),
scheduling.NewRequirement(corev1.LabelWindowsBuild, corev1.NodeSelectorOpExists),
)...),
},
}
nodePools, err := nodepoolutils.ListManaged(ctx, v.kubeClient, v.cloudProvider, nodepoolutils.ForNodeClass(nodeClass))
if err != nil {
return nil, fmt.Errorf("listing nodepools for nodeclass, %w", err)
fallbackInstanceTypes = lo.Filter(fallbackInstanceTypes, func(itFallback *cloudprovider.InstanceType, _ int) bool {
return lo.ContainsBy(instanceTypes, func(it *cloudprovider.InstanceType) bool {
return it.Name == itFallback.Name
})
})
return lo.Ternary(len(fallbackInstanceTypes) == 0, instanceTypes, fallbackInstanceTypes)
}

func getTenancyType(nodePools []*karpv1.NodePool) ec2types.Tenancy {
for _, np := range nodePools {
reqs := scheduling.NewNodeSelectorRequirementsWithMinValues(np.Spec.Template.Spec.Requirements...)
if reqs.Has(v1.LabelInstanceTenancy) && reqs.Get(v1.LabelInstanceTenancy).Has(string(ec2types.TenancyDedicated)) {
return ec2types.TenancyDedicated
}
}
return ec2types.TenancyDefault
}

func getNodePoolCompatibleInstanceTypes(instanceTypes []*cloudprovider.InstanceType, nodePools []*karpv1.NodePool) []*cloudprovider.InstanceType {
var compatibleInstanceTypes []*cloudprovider.InstanceType
names := sets.New[string]()
for _, np := range nodePools {
Expand All @@ -477,22 +490,7 @@ func (v *Validation) getInstanceTypesForNodeClass(ctx context.Context, nodeClass
compatibleInstanceTypes = append(compatibleInstanceTypes, it)
}
}
return getAMICompatibleInstanceTypes(compatibleInstanceTypes, nodeClass), nil
}

func (v *Validation) getTenancyType(ctx context.Context, nodeClass *v1.EC2NodeClass) (ec2types.Tenancy, error) {
nodePools, err := nodepoolutils.ListManaged(ctx, v.kubeClient, v.cloudProvider, nodepoolutils.ForNodeClass(nodeClass))
if err != nil {
return "", fmt.Errorf("listing nodepools for nodeclass, %w", err)
}

for _, np := range nodePools {
reqs := scheduling.NewNodeSelectorRequirementsWithMinValues(np.Spec.Template.Spec.Requirements...)
if reqs.Has(v1.LabelInstanceTenancy) && reqs.Get(v1.LabelInstanceTenancy).Has(string(ec2types.TenancyDedicated)) {
return ec2types.TenancyDedicated, nil
}
}
return ec2types.TenancyDefault, nil
return compatibleInstanceTypes
}

func getAMICompatibleInstanceTypes(instanceTypes []*cloudprovider.InstanceType, nodeClass *v1.EC2NodeClass) []*cloudprovider.InstanceType {
Expand Down
75 changes: 54 additions & 21 deletions pkg/controllers/nodeclass/validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package nodeclass_test

import (
"fmt"
"strings"
"time"

"github.com/awslabs/operatorpkg/object"
Expand Down Expand Up @@ -272,6 +271,60 @@ var _ = Describe("NodeClass Validation Status Controller", func() {
}, fake.MaxCalls(1))
}, nodeclass.ConditionReasonCreateLaunchTemplateAuthFailed),
)
Context("Windows AMI Validation", func() {
DescribeTable(
"should fallback to static instance types when windows ami is used",
func(family string, terms []v1.AMISelectorTerm, expectedAMIID string) {
// Skip Windows 2025 on versions < 1.35
if family == v1.AMIFamilyWindows2025 && version.MustParseGeneric(awsEnv.VersionProvider.Get(ctx)).Minor() < 35 {
Skip("Windows 2025 requires EKS version 1.35+, current version: " + awsEnv.VersionProvider.Get(ctx))
}
nodeClass.Spec.AMIFamily = lo.ToPtr(family)
nodeClass.Spec.AMISelectorTerms = terms
ExpectApplied(ctx, env.Client, nodeClass)
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)
runInstancesInput := awsEnv.EC2API.RunInstancesBehavior.CalledWithInput.Pop()
Expect(runInstancesInput.InstanceType).To(Equal(ec2types.InstanceTypeM5Large))
},
Entry("Windows2019 with m5.large", v1.AMIFamilyWindows2019, []v1.AMISelectorTerm{{Alias: "windows2019@latest"}}, "amd64-ami-id"),
Entry("Windows2022 with m5.large", v1.AMIFamilyWindows2022, []v1.AMISelectorTerm{{Alias: "windows2022@latest"}}, "amd64-ami-id"),
Entry("Windows2025 with m6g.large", v1.AMIFamilyWindows2025, []v1.AMISelectorTerm{{Alias: "windows2025@latest"}}, "arm64-ami-id"),
)
})
Context("NodeClass Filtering", func() {
It("should succeed validation using fallback instance types when NodePool requirements are incompatible with NodeClass AMI", func() {
// AL2023 AMI Family is not compatible with a1 instance family
nodeClass.Spec.AMIFamily = lo.ToPtr(v1.AMIFamilyAL2023)
nodeClass.Spec.AMISelectorTerms = []v1.AMISelectorTerm{{Alias: "al2023@latest"}}

nodePool := coretest.NodePool(karpv1.NodePool{Spec: karpv1.NodePoolSpec{Template: karpv1.NodeClaimTemplate{
Spec: karpv1.NodeClaimTemplateSpec{
Requirements: []karpv1.NodeSelectorRequirementWithMinValues{
{
Key: v1.LabelInstanceFamily,
Operator: corev1.NodeSelectorOpIn,
Values: []string{"a1"},
},
},
NodeClassRef: &karpv1.NodeClassReference{
Group: object.GVK(nodeClass).Group,
Kind: object.GVK(nodeClass).Kind,
Name: nodeClass.Name,
},
},
}}})

ExpectApplied(ctx, env.Client, nodePool, nodeClass)
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)

nodeClass = ExpectExists(ctx, env.Client, nodeClass)
Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeValidationSucceeded).IsTrue()).To(BeTrue())

// Verify a fallback instance type is used for valdiation; not a1
runInstancesInput := awsEnv.EC2API.RunInstancesBehavior.CalledWithInput.Pop()
Expect(string(runInstancesInput.InstanceType)).ToNot(HavePrefix("a1"))
})
})
Context("Instance Type Prioritization Validation", func() {
BeforeEach(func() {
awsEnv.EC2API.DescribeImagesOutput.Set(&ec2.DescribeImagesOutput{
Expand Down Expand Up @@ -308,26 +361,6 @@ var _ = Describe("NodeClass Validation Status Controller", func() {
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed())
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())
})
DescribeTable(
"should fallback to static instance types when no linked NodePools exist",
func(expectedInstanceType ec2types.InstanceType, expectedAMIID string) {
// Filter out the non-target standard AMI to ensure the right instance is selected, but leave the nvidia AMI to
// test AMI selection.
awsEnv.SSMAPI.Parameters = lo.PickBy(awsEnv.SSMAPI.Parameters, func(_, amiID string) bool {
return amiID == expectedAMIID || strings.Contains(amiID, "nvidia")
})
Expect(len(awsEnv.SSMAPI.Parameters)).To(BeNumerically(">", 1))

ExpectApplied(ctx, env.Client, nodeClass)
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)
launchTemplateInput := awsEnv.EC2API.CreateLaunchTemplateBehavior.CalledWithInput.Pop()
runInstancesInput := awsEnv.EC2API.RunInstancesBehavior.CalledWithInput.Pop()
Expect(runInstancesInput.InstanceType).To(Equal(expectedInstanceType))
Expect(launchTemplateInput.LaunchTemplateData.ImageId).To(PointTo(Equal(expectedAMIID)))
},
Entry("m5.large", ec2types.InstanceTypeM5Large, "amd64-ami-id"),
Entry("m6g.large", ec2types.InstanceTypeM6gLarge, "arm64-ami-id"),
)
It("should prioritize non-GPU instances", func() {
nodePool := coretest.NodePool(karpv1.NodePool{Spec: karpv1.NodePoolSpec{Template: karpv1.NodeClaimTemplate{
Spec: karpv1.NodeClaimTemplateSpec{
Expand Down
60 changes: 60 additions & 0 deletions pkg/providers/instancetype/compatibility/compatibility.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package compatibility

import (
"strings"

ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"

v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1"
)

type NodeClass interface {
AMIFamily() string
}

type CompatibleCheck interface {
compatibleCheck(info ec2types.InstanceTypeInfo) bool
}

func IsCompatibleWithNodeClass(info ec2types.InstanceTypeInfo, nodeClass NodeClass) bool {
for _, check := range []CompatibleCheck{
amiFamilyCompatibility(nodeClass.AMIFamily()),
} {
if !check.compatibleCheck(info) {
return false
}
}
return true
}

type amiFamilyCheck struct {
amiFamily string
}

func amiFamilyCompatibility(amiFamily string) CompatibleCheck {
return &amiFamilyCheck{
amiFamily: amiFamily,
}
}

func (c amiFamilyCheck) compatibleCheck(info ec2types.InstanceTypeInfo) bool {
// a1 instance types are not supported with al2023s (https://docs.aws.amazon.com/linux/al2023/ug/system-requirements.html)
if c.amiFamily == v1.AMIFamilyAL2023 && strings.HasPrefix(string(info.InstanceType), "a1.") {
return false
}
return true
}
Loading
Loading