Skip to content

Commit d3b7986

Browse files
committed
feat: expose AMI cache TTL as a runtime flag
Operators running large fleets can generate significant DescribeImages API call volume due to frequent AMI reconciles. This change makes the AMI cache TTL configurable so operators can tune it for their workload without rebuilding. New flag: --ami-cache-ttl (env: AMI_CACHE_TTL, default: 1m). The default preserves existing behaviour.
1 parent ef943b1 commit d3b7986

File tree

9 files changed

+35
-4
lines changed

9 files changed

+35
-4
lines changed

kwok/operator/operator.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont
144144
// the previously resolved value will be used.
145145
lo.Must0(versionProvider.UpdateVersion(ctx))
146146
ssmProvider := ssmp.NewDefaultProvider(ssm.NewFromConfig(cfg), ssmCache)
147-
amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval))
147+
amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(options.FromContext(ctx).AMICacheTTL, awscache.DefaultCleanupInterval))
148148
amiResolver := amifamily.NewDefaultResolver(cfg.Region)
149149
launchTemplateProvider := launchtemplate.NewDefaultProvider(
150150
ctx,

pkg/cache/cache.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ const (
2424
// AWS APIs, which can have a serious impact on performance and scalability.
2525
// DO NOT CHANGE THIS VALUE WITHOUT DUE CONSIDERATION
2626
DefaultTTL = time.Minute
27+
// AMICacheTTL is the default TTL for cached AMI discovery results. Operators
28+
// can override this at runtime via the --ami-cache-ttl flag. Setting the cache
29+
// TTL >= the requeue interval ensures scheduled reconciles are served from cache
30+
// rather than re-querying the EC2 API on every reconcile.
31+
AMICacheTTL = time.Minute
2732
// UnavailableOfferingsTTL is the time before offerings that were marked as unavailable
2833
// are removed from the cache and are available for launch again
2934
UnavailableOfferingsTTL = 3 * time.Minute

pkg/operator/operator.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont
150150
// the previously resolved value will be used.
151151
lo.Must0(versionProvider.UpdateVersion(ctx))
152152
ssmProvider := ssmp.NewDefaultProvider(ssm.NewFromConfig(cfg), ssmCache)
153-
amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval))
153+
amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(options.FromContext(ctx).AMICacheTTL, awscache.DefaultCleanupInterval))
154154
amiResolver := amifamily.NewDefaultResolver(cfg.Region)
155155
launchTemplateProvider := launchtemplate.NewDefaultProvider(
156156
ctx,

pkg/operator/options/options.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"flag"
2121
"fmt"
2222
"os"
23+
"time"
2324

2425
coreoptions "sigs.k8s.io/karpenter/pkg/operator/options"
2526
"sigs.k8s.io/karpenter/pkg/utils/env"
@@ -43,6 +44,7 @@ type Options struct {
4344
InterruptionQueue string
4445
ReservedENIs int
4546
DisableDryRun bool
47+
AMICacheTTL time.Duration
4648
}
4749

4850
func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
@@ -55,6 +57,7 @@ func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
5557
fs.StringVar(&o.InterruptionQueue, "interruption-queue", env.WithDefaultString("INTERRUPTION_QUEUE", ""), "Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.")
5658
fs.IntVar(&o.ReservedENIs, "reserved-enis", env.WithDefaultInt("RESERVED_ENIS", 0), "Reserved ENIs are not included in the calculations for max-pods or kube-reserved. This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html.")
5759
fs.BoolVarWithEnv(&o.DisableDryRun, "disable-dry-run", "DISABLE_DRY_RUN", false, "If true, then disable dry run validation for EC2NodeClasses.")
60+
fs.DurationVar(&o.AMICacheTTL, "ami-cache-ttl", env.WithDefaultDuration("AMI_CACHE_TTL", time.Minute), "TTL for cached AMI discovery results.")
5861
}
5962

6063
func (o *Options) Parse(fs *coreoptions.FlagSet, args ...string) error {

pkg/operator/options/options_validation.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ func (o *Options) Validate() error {
2828
o.validateVMMemoryOverheadPercent(),
2929
o.validateReservedENIs(),
3030
o.validateRequiredFields(),
31+
o.validateAMICacheTTL(),
3132
)
3233
}
3334

@@ -64,3 +65,11 @@ func (o *Options) validateRequiredFields() error {
6465
}
6566
return nil
6667
}
68+
69+
// validateAMICacheTTL returns an error when AMICacheTTL is zero or negative,
// and nil otherwise.
func (o *Options) validateAMICacheTTL() error {
70+
if o.AMICacheTTL <= 0 {
71+
return fmt.Errorf("ami-cache-ttl must be positive")
72+
}
73+
return nil
74+
}
75+

pkg/operator/options/suite_test.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"flag"
2020
"os"
2121
"testing"
22+
"time"
2223

2324
"github.com/samber/lo"
2425
coreoptions "sigs.k8s.io/karpenter/pkg/operator/options"
@@ -63,7 +64,8 @@ var _ = Describe("Options", func() {
6364
"--vm-memory-overhead-percent", "0.1",
6465
"--interruption-queue", "env-cluster",
6566
"--reserved-enis", "10",
66-
"--disable-dry-run")
67+
"--disable-dry-run",
68+
"--ami-cache-ttl", "15m")
6769
Expect(err).ToNot(HaveOccurred())
6870
expectOptionsEqual(opts, test.Options(test.OptionsFields{
6971
ClusterCABundle: lo.ToPtr("env-bundle"),
@@ -74,6 +76,7 @@ var _ = Describe("Options", func() {
7476
InterruptionQueue: lo.ToPtr("env-cluster"),
7577
ReservedENIs: lo.ToPtr(10),
7678
DisableDryRun: lo.ToPtr(true),
79+
AMICacheTTL: lo.ToPtr(15 * time.Minute),
7780
}))
7881
})
7982
It("should correctly fallback to env vars when CLI flags aren't set", func() {
@@ -85,6 +88,7 @@ var _ = Describe("Options", func() {
8588
os.Setenv("INTERRUPTION_QUEUE", "env-cluster")
8689
os.Setenv("RESERVED_ENIS", "10")
8790
os.Setenv("DISABLE_DRY_RUN", "false")
91+
os.Setenv("AMI_CACHE_TTL", "15m")
8892

8993
// Add flags after we set the environment variables so that the parsing logic correctly refers
9094
// to the new environment variable values
@@ -100,6 +104,7 @@ var _ = Describe("Options", func() {
100104
InterruptionQueue: lo.ToPtr("env-cluster"),
101105
ReservedENIs: lo.ToPtr(10),
102106
DisableDryRun: lo.ToPtr(false),
107+
AMICacheTTL: lo.ToPtr(15 * time.Minute),
103108
}))
104109
})
105110

@@ -123,6 +128,10 @@ var _ = Describe("Options", func() {
123128
err := opts.Parse(fs, "--cluster-name", "test-cluster", "--reserved-enis", "-1")
124129
Expect(err).To(HaveOccurred())
125130
})
131+
It("should fail when ami-cache-ttl is zero", func() {
132+
err := opts.Parse(fs, "--cluster-name", "test-cluster", "--ami-cache-ttl", "0")
133+
Expect(err).To(HaveOccurred())
134+
})
126135
})
127136
})
128137

@@ -136,4 +145,5 @@ func expectOptionsEqual(optsA *options.Options, optsB *options.Options) {
136145
Expect(optsA.InterruptionQueue).To(Equal(optsB.InterruptionQueue))
137146
Expect(optsA.ReservedENIs).To(Equal(optsB.ReservedENIs))
138147
Expect(optsA.DisableDryRun).To(Equal(optsB.DisableDryRun))
148+
Expect(optsA.AMICacheTTL).To(Equal(optsB.AMICacheTTL))
139149
}

pkg/test/environment.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ func NewEnvironment(ctx context.Context, env *coretest.Environment) *Environment
117117
iamapi := fake.NewIAMAPI()
118118

119119
// cache
120-
amiCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
120+
amiCache := cache.New(awscache.AMICacheTTL, awscache.DefaultCleanupInterval)
121121
ec2Cache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
122122
instanceTypeCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
123123
instanceCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)

pkg/test/options.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package test
1616

1717
import (
1818
"fmt"
19+
"time"
1920

2021
"github.com/imdario/mergo"
2122
"github.com/samber/lo"
@@ -33,6 +34,7 @@ type OptionsFields struct {
3334
InterruptionQueue *string
3435
ReservedENIs *int
3536
DisableDryRun *bool
37+
AMICacheTTL *time.Duration
3638
}
3739

3840
func Options(overrides ...OptionsFields) *options.Options {
@@ -52,5 +54,6 @@ func Options(overrides ...OptionsFields) *options.Options {
5254
InterruptionQueue: lo.FromPtrOr(opts.InterruptionQueue, ""),
5355
ReservedENIs: lo.FromPtrOr(opts.ReservedENIs, 0),
5456
DisableDryRun: lo.FromPtrOr(opts.DisableDryRun, false),
57+
AMICacheTTL: lo.FromPtrOr(opts.AMICacheTTL, time.Minute),
5558
}
5659
}

website/content/en/preview/reference/settings.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf
1212

1313
| Environment Variable | CLI Flag | Description |
1414
|--|--|--|
15+
| AMI_CACHE_TTL | \-\-ami-cache-ttl | TTL for cached AMI discovery results. (default = 1m0s)|
1516
| BATCH_IDLE_DURATION | \-\-batch-idle-duration | The maximum amount of time with no new pending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. (default = 1s)|
1617
| BATCH_MAX_DURATION | \-\-batch-max-duration | The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one time which usually results in fewer but larger nodes. (default = 10s)|
1718
| CLUSTER_CA_BUNDLE | \-\-cluster-ca-bundle | Cluster CA bundle for nodes to use for TLS connections with the API server. If not set, this is taken from the controller's TLS configuration.|

0 commit comments

Comments
 (0)