Skip to content

Commit fca7f5a

Browse files
authored
fix: iterate over all subnets in RunInstances dry-run validation (#9044)
1 parent c2bbf02 commit fca7f5a

File tree

2 files changed

+42
-21
lines changed

2 files changed

+42
-21
lines changed

pkg/controllers/nodeclass/validation.go

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -273,26 +273,38 @@ func (v *Validation) validateRunInstancesAuthorization(
273273
tags map[string]string,
274274
launchTemplate *launchtemplate.LaunchTemplate,
275275
) (result reconcile.Result, err error) {
276-
runInstancesInput := getRunInstancesInput(nodeClass, tags, launchTemplate)
277-
// Adding NopRetryer to avoid aggressive retry when rate limited
278-
if _, err = v.ec2api.RunInstances(ctx, runInstancesInput, func(o *ec2.Options) {
279-
o.Retryer = aws.NopRetryer{}
280-
}); awserrors.IgnoreDryRunError(err) != nil {
281-
// If we get InstanceProfile NotFound, but we have a resolved instance profile in the status,
282-
// this means there is most likely an eventual consistency issue and we just need to requeue
283-
if awserrors.IsInstanceProfileNotFound(err) || awserrors.IsRateLimitedError(err) || awserrors.IsServerError(err) {
284-
return reconcile.Result{Requeue: true}, nil
285-
}
286-
if awserrors.IgnoreUnauthorizedOperationError(err) != nil {
287-
// Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error
288-
// it would be an unexpected state
289-
return reconcile.Result{}, fmt.Errorf("validating ec2:RunInstances authorization, %w", err)
276+
// We use the first subnet's error to determine the outcome. Mixed-error scenarios across subnets
277+
// are unlikely in practice since authorization policies are not subnet-specific, and transient
278+
// failures on individual subnets are already handled by the early-exit-on-success pattern.
279+
var firstSubnetErr error
280+
for i, subnet := range nodeClass.Status.Subnets {
281+
runInstancesInput := getRunInstancesInput(tags, launchTemplate, &subnet)
282+
if _, err = v.ec2api.RunInstances(ctx, runInstancesInput, func(o *ec2.Options) {
283+
// Adding NopRetryer to avoid aggressive retry when rate limited
284+
o.Retryer = aws.NopRetryer{}
285+
}); awserrors.IgnoreDryRunError(err) != nil {
286+
if i == 0 {
287+
firstSubnetErr = err
288+
}
289+
} else {
290+
// if any of them succeed, we can exit early
291+
return reconcile.Result{}, nil
290292
}
291-
log.FromContext(ctx).Error(err, "unauthorized to call ec2:RunInstances")
292-
v.updateCacheOnFailure(nodeClass, tags, ConditionReasonRunInstancesAuthFailed)
293-
return reconcile.Result{RequeueAfter: requeueAfterTime}, nil
294293
}
295-
return reconcile.Result{}, nil
294+
295+
// If we get InstanceProfile NotFound, but we have a resolved instance profile in the status,
296+
// this means there is most likely an eventual consistency issue and we just need to requeue
297+
if awserrors.IsInstanceProfileNotFound(firstSubnetErr) || awserrors.IsRateLimitedError(firstSubnetErr) || awserrors.IsServerError(firstSubnetErr) {
298+
return reconcile.Result{Requeue: true}, nil
299+
}
300+
if awserrors.IgnoreUnauthorizedOperationError(firstSubnetErr) != nil {
301+
// Dry run should only ever return UnauthorizedOperation or DryRunOperation so if we receive any other error
302+
// it would be an unexpected state
303+
return reconcile.Result{}, fmt.Errorf("validating ec2:RunInstances authorization, %w", firstSubnetErr)
304+
}
305+
log.FromContext(ctx).Error(firstSubnetErr, "unauthorized to call ec2:RunInstances")
306+
v.updateCacheOnFailure(nodeClass, tags, ConditionReasonRunInstancesAuthFailed)
307+
return reconcile.Result{RequeueAfter: requeueAfterTime}, nil
296308
}
297309

298310
func (*Validation) requiredConditions() []string {
@@ -336,9 +348,9 @@ func (v *Validation) clearCacheEntries(nodeClass *v1.EC2NodeClass) {
336348
}
337349

338350
func getRunInstancesInput(
339-
nodeClass *v1.EC2NodeClass,
340351
tags map[string]string,
341352
launchTemplate *launchtemplate.LaunchTemplate,
353+
subnet *v1.Subnet,
342354
) *ec2.RunInstancesInput {
343355
return &ec2.RunInstancesInput{
344356
DryRun: lo.ToPtr(true),
@@ -352,7 +364,7 @@ func getRunInstancesInput(
352364
NetworkInterfaces: []ec2types.InstanceNetworkInterfaceSpecification{
353365
{
354366
DeviceIndex: lo.ToPtr[int32](0),
355-
SubnetId: lo.ToPtr(nodeClass.Status.Subnets[0].ID),
367+
SubnetId: lo.ToPtr(subnet.ID),
356368
},
357369
},
358370
TagSpecifications: []ec2types.TagSpecification{

pkg/controllers/nodeclass/validation_test.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,14 +263,23 @@ var _ = Describe("NodeClass Validation Status Controller", func() {
263263
Entry("should update status condition as NotReady when RunInstances unauthorized", func() {
264264
awsEnv.EC2API.RunInstancesBehavior.Error.Set(&smithy.GenericAPIError{
265265
Code: "UnauthorizedOperation",
266-
}, fake.MaxCalls(1))
266+
}, fake.MaxCalls(4))
267267
}, nodeclass.ConditionReasonRunInstancesAuthFailed),
268268
Entry("should update status condition as NotReady when CreateLaunchTemplate unauthorized", func() {
269269
awsEnv.EC2API.CreateLaunchTemplateBehavior.Error.Set(&smithy.GenericAPIError{
270270
Code: "UnauthorizedOperation",
271271
}, fake.MaxCalls(1))
272272
}, nodeclass.ConditionReasonCreateLaunchTemplateAuthFailed),
273273
)
274+
It("should succeed RunInstances validation when first subnet returns 500 but another subnet succeeds", func() {
275+
// Fail the first RunInstances call (first subnet) with a server error,
276+
// then let subsequent calls (remaining subnets) succeed via the default dry-run path
277+
awsEnv.EC2API.RunInstancesBehavior.Error.Set(fmt.Errorf("InternalError"), fake.MaxCalls(1))
278+
ExpectApplied(ctx, env.Client, nodeClass)
279+
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)
280+
nodeClass = ExpectExists(ctx, env.Client, nodeClass)
281+
Expect(nodeClass.StatusConditions().Get(v1.ConditionTypeValidationSucceeded).IsTrue()).To(BeTrue())
282+
})
274283
Context("Windows AMI Validation", func() {
275284
DescribeTable(
276285
"should fallback to static instance types when windows ami is used",

0 commit comments

Comments
 (0)