feat: expose AMI cache TTL as a runtime flag

chrisdoherty4 · chrisdoherty4 · commit d3b7986e536c · 2026-04-03T14:50:19.000-05:00
Operators running large fleets can generate significant DescribeImages
API call volume due to frequent AMI reconciles. This change makes the
AMI cache TTL configurable so operators can tune it for their workload
without rebuilding.

  --ami-cache-ttl        (env: AMI_CACHE_TTL,        default: 1m)

The default preserves existing behaviour.
diff --git a/kwok/operator/operator.go b/kwok/operator/operator.go
@@ -144,7 +144,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont
 	// the previously resolved value will be used.
 	lo.Must0(versionProvider.UpdateVersion(ctx))
 	ssmProvider := ssmp.NewDefaultProvider(ssm.NewFromConfig(cfg), ssmCache)
-	amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval))
+	amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(options.FromContext(ctx).AMICacheTTL, awscache.DefaultCleanupInterval))
 	amiResolver := amifamily.NewDefaultResolver(cfg.Region)
 	launchTemplateProvider := launchtemplate.NewDefaultProvider(
 		ctx,
diff --git a/pkg/cache/cache.go b/pkg/cache/cache.go
@@ -24,6 +24,11 @@ const (
 	// AWS APIs, which can have a serious impact on performance and scalability.
 	// DO NOT CHANGE THIS VALUE WITHOUT DUE CONSIDERATION
 	DefaultTTL = time.Minute
+	// AMICacheTTL is the default TTL for cached AMI discovery results. Operators
+	// can override this at runtime via the --ami-cache-ttl flag. Setting the cache
+	// TTL >= the requeue interval ensures scheduled reconciles are served from cache
+	// rather than re-querying the EC2 API on every reconcile.
+	AMICacheTTL = time.Minute
 	// UnavailableOfferingsTTL is the time before offerings that were marked as unavailable
 	// are removed from the cache and are available for launch again
 	UnavailableOfferingsTTL = 3 * time.Minute
diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go
@@ -150,7 +150,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont
 	// the previously resolved value will be used.
 	lo.Must0(versionProvider.UpdateVersion(ctx))
 	ssmProvider := ssmp.NewDefaultProvider(ssm.NewFromConfig(cfg), ssmCache)
-	amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval))
+	amiProvider := amifamily.NewDefaultProvider(operator.Clock, versionProvider, ssmProvider, ec2api, cache.New(options.FromContext(ctx).AMICacheTTL, awscache.DefaultCleanupInterval))
 	amiResolver := amifamily.NewDefaultResolver(cfg.Region)
 	launchTemplateProvider := launchtemplate.NewDefaultProvider(
 		ctx,
diff --git a/pkg/operator/options/options.go b/pkg/operator/options/options.go
@@ -20,6 +20,7 @@ import (
 	"flag"
 	"fmt"
 	"os"
+	"time"
 
 	coreoptions "sigs.k8s.io/karpenter/pkg/operator/options"
 	"sigs.k8s.io/karpenter/pkg/utils/env"
@@ -43,6 +44,7 @@ type Options struct {
 	InterruptionQueue       string
 	ReservedENIs            int
 	DisableDryRun           bool
+	AMICacheTTL             time.Duration
 }
 
 func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
@@ -55,6 +57,7 @@ func (o *Options) AddFlags(fs *coreoptions.FlagSet) {
 	fs.StringVar(&o.InterruptionQueue, "interruption-queue", env.WithDefaultString("INTERRUPTION_QUEUE", ""), "Interruption queue is the name of the SQS queue used for processing interruption events from EC2. Interruption handling is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.")
 	fs.IntVar(&o.ReservedENIs, "reserved-enis", env.WithDefaultInt("RESERVED_ENIS", 0), "Reserved ENIs are not included in the calculations for max-pods or kube-reserved. This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html.")
 	fs.BoolVarWithEnv(&o.DisableDryRun, "disable-dry-run", "DISABLE_DRY_RUN", false, "If true, then disable dry run validation for EC2NodeClasses.")
+	fs.DurationVar(&o.AMICacheTTL, "ami-cache-ttl", env.WithDefaultDuration("AMI_CACHE_TTL", time.Minute), "TTL for cached AMI discovery results.")
 }
 
 func (o *Options) Parse(fs *coreoptions.FlagSet, args ...string) error {
diff --git a/pkg/operator/options/options_validation.go b/pkg/operator/options/options_validation.go
@@ -28,6 +28,7 @@ func (o *Options) Validate() error {
 		o.validateVMMemoryOverheadPercent(),
 		o.validateReservedENIs(),
 		o.validateRequiredFields(),
+		o.validateAMICacheTTL(),
 	)
 }
 
@@ -64,3 +65,11 @@ func (o *Options) validateRequiredFields() error {
 	}
 	return nil
 }
+
+func (o *Options) validateAMICacheTTL() error {
+	if o.AMICacheTTL <= 0 {
+		return fmt.Errorf("ami-cache-ttl must be positive")
+	}
+	return nil
+}
+
diff --git a/pkg/operator/options/suite_test.go b/pkg/operator/options/suite_test.go
@@ -19,6 +19,7 @@ import (
 	"flag"
 	"os"
 	"testing"
+	"time"
 
 	"github.com/samber/lo"
 	coreoptions "sigs.k8s.io/karpenter/pkg/operator/options"
@@ -63,7 +64,8 @@ var _ = Describe("Options", func() {
 			"--vm-memory-overhead-percent", "0.1",
 			"--interruption-queue", "env-cluster",
 			"--reserved-enis", "10",
-			"--disable-dry-run")
+			"--disable-dry-run",
+			"--ami-cache-ttl", "15m")
 		Expect(err).ToNot(HaveOccurred())
 		expectOptionsEqual(opts, test.Options(test.OptionsFields{
 			ClusterCABundle:         lo.ToPtr("env-bundle"),
@@ -74,6 +76,7 @@ var _ = Describe("Options", func() {
 			InterruptionQueue:       lo.ToPtr("env-cluster"),
 			ReservedENIs:            lo.ToPtr(10),
 			DisableDryRun:           lo.ToPtr(true),
+			AMICacheTTL:             lo.ToPtr(15 * time.Minute),
 		}))
 	})
 	It("should correctly fallback to env vars when CLI flags aren't set", func() {
@@ -85,6 +88,7 @@ var _ = Describe("Options", func() {
 		os.Setenv("INTERRUPTION_QUEUE", "env-cluster")
 		os.Setenv("RESERVED_ENIS", "10")
 		os.Setenv("DISABLE_DRY_RUN", "false")
+		os.Setenv("AMI_CACHE_TTL", "15m")
 
 		// Add flags after we set the environment variables so that the parsing logic correctly refers
 		// to the new environment variable values
@@ -100,6 +104,7 @@ var _ = Describe("Options", func() {
 			InterruptionQueue:       lo.ToPtr("env-cluster"),
 			ReservedENIs:            lo.ToPtr(10),
 			DisableDryRun:           lo.ToPtr(false),
+			AMICacheTTL:             lo.ToPtr(15 * time.Minute),
 		}))
 	})
 
@@ -123,6 +128,10 @@ var _ = Describe("Options", func() {
 			err := opts.Parse(fs, "--cluster-name", "test-cluster", "--reserved-enis", "-1")
 			Expect(err).To(HaveOccurred())
 		})
+		It("should fail when ami-cache-ttl is zero", func() {
+			err := opts.Parse(fs, "--cluster-name", "test-cluster", "--ami-cache-ttl", "0")
+			Expect(err).To(HaveOccurred())
+		})
 	})
 })
 
@@ -136,4 +145,5 @@ func expectOptionsEqual(optsA *options.Options, optsB *options.Options) {
 	Expect(optsA.InterruptionQueue).To(Equal(optsB.InterruptionQueue))
 	Expect(optsA.ReservedENIs).To(Equal(optsB.ReservedENIs))
 	Expect(optsA.DisableDryRun).To(Equal(optsB.DisableDryRun))
+	Expect(optsA.AMICacheTTL).To(Equal(optsB.AMICacheTTL))
 }
diff --git a/pkg/test/environment.go b/pkg/test/environment.go
@@ -117,7 +117,7 @@ func NewEnvironment(ctx context.Context, env *coretest.Environment) *Environment
 	iamapi := fake.NewIAMAPI()
 
 	// cache
-	amiCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
+	amiCache := cache.New(awscache.AMICacheTTL, awscache.DefaultCleanupInterval)
 	ec2Cache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
 	instanceTypeCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
 	instanceCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval)
diff --git a/pkg/test/options.go b/pkg/test/options.go
@@ -16,6 +16,7 @@ package test
 
 import (
 	"fmt"
+	"time"
 
 	"github.com/imdario/mergo"
 	"github.com/samber/lo"
@@ -33,6 +34,7 @@ type OptionsFields struct {
 	InterruptionQueue       *string
 	ReservedENIs            *int
 	DisableDryRun           *bool
+	AMICacheTTL             *time.Duration
 }
 
 func Options(overrides ...OptionsFields) *options.Options {
@@ -52,5 +54,6 @@ func Options(overrides ...OptionsFields) *options.Options {
 		InterruptionQueue:       lo.FromPtrOr(opts.InterruptionQueue, ""),
 		ReservedENIs:            lo.FromPtrOr(opts.ReservedENIs, 0),
 		DisableDryRun:           lo.FromPtrOr(opts.DisableDryRun, false),
+		AMICacheTTL:             lo.FromPtrOr(opts.AMICacheTTL, time.Minute),
 	}
 }
diff --git a/website/content/en/preview/reference/settings.md b/website/content/en/preview/reference/settings.md
@@ -12,6 +12,7 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf
 
 | Environment Variable | CLI Flag | Description |
 |--|--|--|
+| AMI_CACHE_TTL | \-\-ami-cache-ttl | TTL for cached AMI discovery results. (default = 1m0s)|
 | BATCH_IDLE_DURATION | \-\-batch-idle-duration | The maximum amount of time with no new pending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. (default = 1s)|
 | BATCH_MAX_DURATION | \-\-batch-max-duration | The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one time which usually results in fewer but larger nodes. (default = 10s)|
 | CLUSTER_CA_BUNDLE | \-\-cluster-ca-bundle | Cluster CA bundle for nodes to use for TLS connections with the API server. If not set, this is taken from the controller's TLS configuration.|

Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,7 @@ func (o *Options) Validate() error {`
`28`	`28`	`o.validateVMMemoryOverheadPercent(),`
`29`	`29`	`o.validateReservedENIs(),`
`30`	`30`	`o.validateRequiredFields(),`
	`31`	`+ o.validateAMICacheTTL(),`
`31`	`32`	`)`
`32`	`33`	`}`
`33`	`34`
`@@ -64,3 +65,11 @@ func (o *Options) validateRequiredFields() error {`
`64`	`65`	`}`
`65`	`66`	`return nil`
`66`	`67`	`}`
	`68`	`+`
	`69`	`+func (o *Options) validateAMICacheTTL() error {`
	`70`	`+ if o.AMICacheTTL <= 0 {`
	`71`	`+ return fmt.Errorf("ami-cache-ttl must be positive")`
	`72`	`+ }`
	`73`	`+ return nil`
	`74`	`+}`
	`75`	`+`
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ package test`
`16`	`16`
`17`	`17`	`import (`
`18`	`18`	`"fmt"`
	`19`	`+ "time"`
`19`	`20`
`20`	`21`	`"github.com/imdario/mergo"`
`21`	`22`	`"github.com/samber/lo"`
`@@ -33,6 +34,7 @@ type OptionsFields struct {`
`33`	`34`	`InterruptionQueue *string`
`34`	`35`	`ReservedENIs *int`
`35`	`36`	`DisableDryRun *bool`
	`37`	`+ AMICacheTTL *time.Duration`
`36`	`38`	`}`
`37`	`39`
`38`	`40`	`func Options(overrides ...OptionsFields) *options.Options {`
`@@ -52,5 +54,6 @@ func Options(overrides ...OptionsFields) *options.Options {`
`52`	`54`	`InterruptionQueue: lo.FromPtrOr(opts.InterruptionQueue, ""),`
`53`	`55`	`ReservedENIs: lo.FromPtrOr(opts.ReservedENIs, 0),`
`54`	`56`	`DisableDryRun: lo.FromPtrOr(opts.DisableDryRun, false),`
	`57`	`+ AMICacheTTL: lo.FromPtrOr(opts.AMICacheTTL, time.Minute),`
`55`	`58`	`}`
`56`	`59`	`}`