diff --git a/examples/44-node-repair.yaml b/examples/44-node-repair.yaml new file mode 100644 index 0000000000..ddbe861a6f --- /dev/null +++ b/examples/44-node-repair.yaml @@ -0,0 +1,13 @@ +# An example ClusterConfig that uses a managed node group with auto repair. + +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: cluster-44 + region: us-west-2 + +managedNodeGroups: +- name: ng-1 + nodeRepairConfig: + enabled: true diff --git a/go.mod b/go.mod index 25bdfafb31..984266e1a8 100644 --- a/go.mod +++ b/go.mod @@ -16,7 +16,7 @@ require ( github.com/aws/aws-sdk-go-v2/credentials v1.17.11 github.com/aws/aws-sdk-go-v2/service/autoscaling v1.51.1 github.com/aws/aws-sdk-go-v2/service/cloudformation v1.56.1 - github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.46.2 + github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.46.3 github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.45.0 github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider v1.36.3 github.com/aws/aws-sdk-go-v2/service/ec2 v1.166.0 diff --git a/go.sum b/go.sum index 643001319b..e77165d7e5 100644 --- a/go.sum +++ b/go.sum @@ -740,6 +740,8 @@ github.com/aws/aws-sdk-go-v2/service/cloudformation v1.56.1 h1:EqRhsrEoXFFyzcNuq github.com/aws/aws-sdk-go-v2/service/cloudformation v1.56.1/go.mod h1:75rrfzgrN4Ol0m9Xo4+8S09KBoGAd1t6eafFHMt5wDI= github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.46.2 h1:DrN2vg75JseLCepYjMVav43e+v7+AhArtWlm2F0OJ6Y= github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.46.2/go.mod h1:WcTfALKgqv+VCMRCLtG4155sAwcfdYhFADc/yDJgSlc= +github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.46.3 h1:DfrEQMWCfk0wkuv/r0zwcGoykCuYWCLoGolbax6O3sw= +github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.46.3/go.mod h1:WcTfALKgqv+VCMRCLtG4155sAwcfdYhFADc/yDJgSlc= github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.45.0 h1:j9rGKWaYglZpf9KbJCQVM/L85Y4UdGMgK80A1OddR24= github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.45.0/go.mod h1:LZafBHU62ByizrdhNLMnzWGsUX+abAW4q35PN+FOj+A= 
github.com/aws/aws-sdk-go-v2/service/cognitoidentityprovider v1.36.3 h1:JNWpkjImTP2e308bv7ihfwgOawf640BY/pyZWrBb9rw= diff --git a/goformation/cloudformation/eks/aws-eks-nodegroup.go b/goformation/cloudformation/eks/aws-eks-nodegroup.go index 4e06e8349f..1cb6fd9e1d 100644 --- a/goformation/cloudformation/eks/aws-eks-nodegroup.go +++ b/goformation/cloudformation/eks/aws-eks-nodegroup.go @@ -54,6 +54,11 @@ type Nodegroup struct { // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-eks-nodegroup.html#cfn-eks-nodegroup-launchtemplate LaunchTemplate *Nodegroup_LaunchTemplateSpecification `json:"LaunchTemplate,omitempty"` + // NodeRepairConfig AWS CloudFormation Property + // Required: false + // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-eks-nodegroup.html#cfn-eks-nodegroup-noderepairconfig + NodeRepairConfig *Nodegroup_NodeRepairConfig `json:"NodeRepairConfig,omitempty"` + // NodeRole AWS CloudFormation Property // Required: true // See: http://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-eks-nodegroup.html#cfn-eks-nodegroup-noderole diff --git a/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go b/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go new file mode 100644 index 0000000000..edf17f899d --- /dev/null +++ b/goformation/cloudformation/eks/aws-eks-nodegroup_noderepairconfig.go @@ -0,0 +1,32 @@ +package eks + +import ( + "goformation/v4/cloudformation/types" + + "goformation/v4/cloudformation/policies" +) + +// Nodegroup_NodeRepairConfig AWS CloudFormation Resource (AWS::EKS::Nodegroup.NodeRepairConfig) +type Nodegroup_NodeRepairConfig struct { + Enabled *types.Value `json:"Enabled,omitempty"` + + // AWSCloudFormationDeletionPolicy represents a CloudFormation DeletionPolicy + AWSCloudFormationDeletionPolicy policies.DeletionPolicy `json:"-"` + + // AWSCloudFormationUpdateReplacePolicy represents a CloudFormation UpdateReplacePolicy + 
AWSCloudFormationUpdateReplacePolicy policies.UpdateReplacePolicy `json:"-"` + + // AWSCloudFormationDependsOn stores the logical ID of the resources to be created before this resource + AWSCloudFormationDependsOn []string `json:"-"` + + // AWSCloudFormationMetadata stores structured data associated with this resource + AWSCloudFormationMetadata map[string]interface{} `json:"-"` + + // AWSCloudFormationCondition stores the logical ID of the condition that must be satisfied for this resource to be created + AWSCloudFormationCondition string `json:"-"` +} + +// AWSCloudFormationType returns the AWS CloudFormation resource type +func (r *Nodegroup_NodeRepairConfig) AWSCloudFormationType() string { + return "AWS::EKS::Nodegroup.NodeRepairConfig" +} diff --git a/pkg/apis/eksctl.io/v1alpha5/assets/schema.json b/pkg/apis/eksctl.io/v1alpha5/assets/schema.json index 2e720d60fb..a36295d198 100755 --- a/pkg/apis/eksctl.io/v1alpha5/assets/schema.json +++ b/pkg/apis/eksctl.io/v1alpha5/assets/schema.json @@ -1469,6 +1469,11 @@ "name": { "type": "string" }, + "nodeRepairConfig": { + "$ref": "#/definitions/NodeGroupNodeRepairConfig", + "description": "configures the auto repair feature of the nodegroup", + "x-intellij-html-description": "configures the auto repair feature of the nodegroup" + }, "outpostARN": { "type": "string", "description": "specifies the Outpost ARN in which the nodegroup should be created.", @@ -1633,7 +1638,8 @@ "taints", "updateConfig", "launchTemplate", - "releaseVersion" + "releaseVersion", + "nodeRepairConfig" ], "additionalProperties": false, "description": "represents an EKS-managed nodegroup", @@ -2205,6 +2211,21 @@ "description": "holds the configuration for [spot instances](/usage/spot-instances/)", "x-intellij-html-description": "holds the configuration for spot instances" }, + "NodeGroupNodeRepairConfig": { + "properties": { + "enabled": { + "type": "boolean", + "description": "Enables the auto repair feature for the nodegroup", + 
"x-intellij-html-description": "Enables the auto repair feature for the nodegroup" + } + }, + "preferredOrder": [ + "enabled" + ], + "additionalProperties": false, + "description": "contains the auto repair configuration for the nodegroup", + "x-intellij-html-description": "contains the auto repair configuration for the nodegroup" + }, "NodeGroupSGs": { "properties": { "attachIDs": { diff --git a/pkg/apis/eksctl.io/v1alpha5/types.go b/pkg/apis/eksctl.io/v1alpha5/types.go index 758d3907e0..d88fd7f22d 100644 --- a/pkg/apis/eksctl.io/v1alpha5/types.go +++ b/pkg/apis/eksctl.io/v1alpha5/types.go @@ -1596,6 +1596,13 @@ type ( // +optional MaxUnavailablePercentage *int `json:"maxUnavailablePercentage,omitempty"` } + + // NodeGroupNodeRepairConfig contains the auto repair configuration for the nodegroup + NodeGroupNodeRepairConfig struct { + // Enables the auto repair feature for the nodegroup + // +optional + Enabled *bool `json:"enabled,omitempty"` + } ) // MetricsCollection used by the scaling config, @@ -1883,6 +1890,10 @@ type ManagedNodeGroup struct { // ReleaseVersion the AMI version of the EKS optimized AMI to use ReleaseVersion string `json:"releaseVersion"` + // NodeRepairConfig configures the auto repair feature of the nodegroup + // +optional + NodeRepairConfig *NodeGroupNodeRepairConfig `json:"nodeRepairConfig,omitempty"` + // Internal fields Unowned bool `json:"-"` diff --git a/pkg/cfn/builder/managed_launch_template_test.go b/pkg/cfn/builder/managed_launch_template_test.go index da496486b4..acc5e2f30d 100644 --- a/pkg/cfn/builder/managed_launch_template_test.go +++ b/pkg/cfn/builder/managed_launch_template_test.go @@ -266,6 +266,19 @@ API_SERVER_URL=https://test.com resourcesFilename: "spot.json", }), + Entry("With node repair enabled", &mngCase{ + ng: &api.ManagedNodeGroup{ + NodeGroupBase: &api.NodeGroupBase{ + Name: "node-repair-enabled", + InstanceType: "m5.xlarge", + }, + NodeRepairConfig: &api.NodeGroupNodeRepairConfig{ + Enabled: aws.Bool(true), + }, 
+ }, + resourcesFilename: "node-repair-enabled.json", + }), + Entry("Without instance type set in the launch template", &mngCase{ ng: &api.ManagedNodeGroup{ NodeGroupBase: &api.NodeGroupBase{ diff --git a/pkg/cfn/builder/managed_nodegroup.go b/pkg/cfn/builder/managed_nodegroup.go index 3bbd438246..5b82811601 100644 --- a/pkg/cfn/builder/managed_nodegroup.go +++ b/pkg/cfn/builder/managed_nodegroup.go @@ -117,6 +117,14 @@ func (m *ManagedNodeGroupResourceSet) AddAllResources(ctx context.Context) error managedResource.UpdateConfig = updateConfig } + if m.nodeGroup.NodeRepairConfig != nil { + nodeRepairConfig := &gfneks.Nodegroup_NodeRepairConfig{} + if m.nodeGroup.NodeRepairConfig.Enabled != nil { + nodeRepairConfig.Enabled = gfnt.NewBoolean(*m.nodeGroup.NodeRepairConfig.Enabled) + } + managedResource.NodeRepairConfig = nodeRepairConfig + } + if m.nodeGroup.Spot { // TODO use constant from SDK managedResource.CapacityType = gfnt.NewString("SPOT") diff --git a/pkg/cfn/builder/testdata/launch_template/node-repair-enabled.json b/pkg/cfn/builder/testdata/launch_template/node-repair-enabled.json new file mode 100644 index 0000000000..83578dadd4 --- /dev/null +++ b/pkg/cfn/builder/testdata/launch_template/node-repair-enabled.json @@ -0,0 +1,176 @@ +{ + "LaunchTemplate": { + "Type": "AWS::EC2::LaunchTemplate", + "Properties": { + "LaunchTemplateData": { + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/xvda", + "Ebs": { + "Iops": 3000, + "Throughput": 125, + "VolumeSize": 80, + "VolumeType": "gp3" + } + } + ], + "MetadataOptions": { + "HttpPutResponseHopLimit": 2, + "HttpTokens": "required" + }, + "SecurityGroupIds": [ + { + "Fn::ImportValue": "eksctl-lt::ClusterSecurityGroupId" + } + ], + "TagSpecifications": [ + { + "ResourceType": "instance", + "Tags": [ + { + "Key": "Name", + "Value": "lt-node-repair-enabled-Node" + }, + { + "Key": "alpha.eksctl.io/nodegroup-name", + "Value": "node-repair-enabled" + }, + { + "Key": "alpha.eksctl.io/nodegroup-type", + "Value": 
"managed" + } + ] + }, + { + "ResourceType": "volume", + "Tags": [ + { + "Key": "Name", + "Value": "lt-node-repair-enabled-Node" + }, + { + "Key": "alpha.eksctl.io/nodegroup-name", + "Value": "node-repair-enabled" + }, + { + "Key": "alpha.eksctl.io/nodegroup-type", + "Value": "managed" + } + ] + }, + { + "ResourceType": "network-interface", + "Tags": [ + { + "Key": "Name", + "Value": "lt-node-repair-enabled-Node" + }, + { + "Key": "alpha.eksctl.io/nodegroup-name", + "Value": "node-repair-enabled" + }, + { + "Key": "alpha.eksctl.io/nodegroup-type", + "Value": "managed" + } + ] + } + ] + }, + "LaunchTemplateName": { + "Fn::Sub": "${AWS::StackName}" + } + } + }, + "ManagedNodeGroup": { + "Type": "AWS::EKS::Nodegroup", + "Properties": { + "AmiType": "AL2023_x86_64_STANDARD", + "ClusterName": "lt", + "Labels": { + "alpha.eksctl.io/cluster-name": "lt", + "alpha.eksctl.io/nodegroup-name": "node-repair-enabled" + }, + "InstanceTypes": ["m5.xlarge"], + "NodeRole": { + "Fn::GetAtt": [ + "NodeInstanceRole", + "Arn" + ] + }, + "NodegroupName": "node-repair-enabled", + "ScalingConfig": { + "DesiredSize": 2, + "MaxSize": 2, + "MinSize": 2 + }, + "Subnets": [ + "subnet-public-us-west-2a" + ], + "Tags": { + "alpha.eksctl.io/nodegroup-name": "node-repair-enabled", + "alpha.eksctl.io/nodegroup-type": "managed" + }, + "LaunchTemplate": { + "Id": { + "Ref": "LaunchTemplate" + } + }, + "NodeRepairConfig": { + "Enabled": true + } + } + }, + "NodeInstanceRole": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ + { + "Action": [ + "sts:AssumeRole" + ], + "Effect": "Allow", + "Principal": { + "Service": [ + { + "Fn::FindInMap": [ + "ServicePrincipalPartitionMap", + { + "Ref": "AWS::Partition" + }, + "EC2" + ] + } + ] + } + } + ], + "Version": "2012-10-17" + }, + "ManagedPolicyArns": [ + { + "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + }, + { + "Fn::Sub": 
"arn:${AWS::Partition}:iam::aws:policy/AmazonEKSWorkerNodePolicy" + }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonEKS_CNI_Policy" + }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + ], + "Path": "/", + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Sub": "${AWS::StackName}/NodeInstanceRole" + } + } + ] + } + } +} diff --git a/pkg/ctl/cmdutils/configfile.go b/pkg/ctl/cmdutils/configfile.go index 29d7e76294..087313a590 100644 --- a/pkg/ctl/cmdutils/configfile.go +++ b/pkg/ctl/cmdutils/configfile.go @@ -71,6 +71,7 @@ var ( commonNGFlagsIncompatibleWithConfigFile = []string{ "managed", "spot", + "enable-node-repair", "instance-types", "nodes", "nodes-min", @@ -605,11 +606,17 @@ func makeManagedNodegroup(nodeGroup *api.NodeGroup, options CreateManagedNGOptio AttachIDs: ngBase.SecurityGroups.AttachIDs, } } - return &api.ManagedNodeGroup{ + mng := &api.ManagedNodeGroup{ NodeGroupBase: &ngBase, Spot: options.Spot, InstanceTypes: options.InstanceTypes, } + if options.NodeRepairEnabled { + mng.NodeRepairConfig = &api.NodeGroupNodeRepairConfig{ + Enabled: &options.NodeRepairEnabled, + } + } + return mng } func validateUnsupportedCLIFeatures(ng *api.ManagedNodeGroup) error { @@ -620,7 +627,7 @@ func validateManagedNGFlags(cmd *cobra.Command, managed bool) error { if managed { return nil } - flagsValidOnlyWithMNG := []string{"spot", "instance-types"} + flagsValidOnlyWithMNG := []string{"spot", "enable-node-repair", "instance-types"} if flagName, found := findChangedFlag(cmd, flagsValidOnlyWithMNG); found { return errors.Errorf("--%s is only valid with managed nodegroups (--managed)", flagName) } diff --git a/pkg/ctl/cmdutils/create_cluster.go b/pkg/ctl/cmdutils/create_cluster.go index e000bd7de8..31201c0d23 100644 --- a/pkg/ctl/cmdutils/create_cluster.go +++ b/pkg/ctl/cmdutils/create_cluster.go @@ -39,9 +39,10 @@ type NodeGroupOptions struct { // CreateManagedNGOptions holds options for creating a managed 
nodegroup type CreateManagedNGOptions struct { - Managed bool - Spot bool - InstanceTypes []string + Managed bool + Spot bool + NodeRepairEnabled bool + InstanceTypes []string } // CreateNGOptions holds options for creating a nodegroup diff --git a/pkg/ctl/cmdutils/nodegroup_flags.go b/pkg/ctl/cmdutils/nodegroup_flags.go index 015765ab86..50f4c751ea 100644 --- a/pkg/ctl/cmdutils/nodegroup_flags.go +++ b/pkg/ctl/cmdutils/nodegroup_flags.go @@ -56,6 +56,7 @@ func AddCommonCreateNodeGroupFlags(fs *pflag.FlagSet, cmd *Cmd, ng *api.NodeGrou fs.BoolVarP(&mngOptions.Managed, "managed", "", true, "Create EKS-managed nodegroup") fs.BoolVar(&mngOptions.Spot, "spot", false, "Create a spot nodegroup (managed nodegroups only)") + fs.BoolVar(&mngOptions.NodeRepairEnabled, "enable-node-repair", false, "Enable automatic node repair (managed nodegroups only)") fs.StringSliceVar(&mngOptions.InstanceTypes, "instance-types", nil, "Comma-separated list of instance types (e.g., --instance-types=c3.large,c4.large,c5.large") } diff --git a/pkg/ctl/create/nodegroup_test.go b/pkg/ctl/create/nodegroup_test.go index b40c3cd490..23f11a8ba2 100644 --- a/pkg/ctl/create/nodegroup_test.go +++ b/pkg/ctl/create/nodegroup_test.go @@ -85,6 +85,10 @@ var _ = Describe("create nodegroup", func() { args: []string{"--cluster", "foo", "--spot"}, error: "--spot is only valid with managed nodegroups (--managed)", }), + Entry("with enable-node-repair flag", invalidParamsCase{ + args: []string{"--cluster", "foo", "--enable-node-repair"}, + error: "--enable-node-repair is only valid with managed nodegroups (--managed)", + }), Entry("with instance-types flag", invalidParamsCase{ args: []string{"--cluster", "foo", "--instance-types", "some-type"}, error: "--instance-types is only valid with managed nodegroups (--managed)", diff --git a/userdocs/mkdocs.yml b/userdocs/mkdocs.yml index dd60b40596..85c4e97ac1 100644 --- a/userdocs/mkdocs.yml +++ b/userdocs/mkdocs.yml @@ -177,6 +177,7 @@ nav: - 
usage/windows-worker-nodes.md - usage/nodegroup-additional-volume-mappings.md - usage/hybrid-nodes.md + - usage/nodegroup-node-repair-config.md - usage/eksctl-karpenter.md - usage/eksctl-anywhere.md - GitOps: diff --git a/userdocs/src/usage/nodegroup-node-repair-config.md b/userdocs/src/usage/nodegroup-node-repair-config.md new file mode 100644 index 0000000000..f42ff4b434 --- /dev/null +++ b/userdocs/src/usage/nodegroup-node-repair-config.md @@ -0,0 +1,47 @@ +# Support for Node Repair Config in EKS Managed Nodegroups + +EKS Managed Nodegroups now supports Node Repair, where the health of managed nodes is monitored, +and unhealthy worker nodes are replaced or rebooted in response. + +## Creating a cluster with a managed nodegroup with node repair enabled + +To create a cluster with a managed nodegroup using node repair, pass the `--enable-node-repair` flag: + +```shell +$ eksctl create cluster --enable-node-repair +``` + +To create a managed nodegroup using node repair on an existing cluster: + +```shell +$ eksctl create nodegroup --cluster=<clusterName> --enable-node-repair +``` + +To create a cluster with a managed nodegroup using node repair via a config file: + +```yaml +# node-repair-nodegroup-cluster.yaml +--- +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: cluster-44 + region: us-west-2 + +managedNodeGroups: +- name: ng-1 + nodeRepairConfig: + enabled: true + +``` + +```shell +$ eksctl create cluster -f node-repair-nodegroup-cluster.yaml +``` + +## Further information + +- [EKS Managed Nodegroup Node Health][eks-user-guide] + +[eks-user-guide]: https://docs.aws.amazon.com/eks/latest/userguide/node-health.html