Skip to content

Commit 0a03d36

Browse files
committed
✨ Add ReconcileTimeouts metric to track ReconciliationTimeout timeouts
This PR adds a new metric controller_runtime_reconcile_timeouts_total to track when a controller's reconciliation has reached a ReconciliationTimeout. This provides visibility into when reconcile operations time out due to the controller-runtime wrapper timeout, allowing users to alert / monitor unexpectedly long running controller reconiliations. Fixes #3381
1 parent 6724282 commit 0a03d36

File tree

3 files changed

+176
-2
lines changed

3 files changed

+176
-2
lines changed

pkg/internal/controller/controller.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ import (
3939
"sigs.k8s.io/controller-runtime/pkg/source"
4040
)
4141

42+
// errReconciliationTimeout is the error used as the cause when the ReconciliationTimeout guardrail fires.
43+
// This allows us to distinguish wrapper timeouts from user-initiated context cancellations.
44+
var errReconciliationTimeout = errors.New("reconciliation timeout")
45+
4246
// Options are the arguments for creating a new Controller.
4347
type Options[request comparable] struct {
4448
// Reconciler is a function that can be called at any time with the Name / Namespace of an object and
@@ -207,13 +211,26 @@ func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ rec
207211
}
208212
}()
209213

214+
var timeoutCause error
210215
if c.ReconciliationTimeout > 0 {
216+
timeoutCause = errReconciliationTimeout
211217
var cancel context.CancelFunc
212-
ctx, cancel = context.WithTimeout(ctx, c.ReconciliationTimeout)
218+
ctx, cancel = context.WithTimeoutCause(ctx, c.ReconciliationTimeout, timeoutCause)
213219
defer cancel()
214220
}
215221

216-
return c.Do.Reconcile(ctx, req)
222+
res, err := c.Do.Reconcile(ctx, req)
223+
224+
// Check if the reconciliation timed out due to our wrapper timeout guardrail.
225+
// We check ctx.Err() == context.DeadlineExceeded first to ensure the context was actually
226+
// cancelled due to a deadline (not parent cancellation or other reasons), then verify it was
227+
// our specific timeout cause. This prevents false positives from parent context cancellations
228+
// or other timeout scenarios.
229+
if timeoutCause != nil && ctx.Err() == context.DeadlineExceeded && errors.Is(context.Cause(ctx), timeoutCause) {
230+
ctrlmetrics.ReconcileTimeouts.WithLabelValues(c.Name).Inc()
231+
}
232+
233+
return res, err
217234
}
218235

219236
// Watch implements controller.Controller.
@@ -437,6 +454,7 @@ func (c *Controller[request]) initMetrics() {
437454
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0)
438455
ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Add(0)
439456
ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Add(0)
457+
ctrlmetrics.ReconcileTimeouts.WithLabelValues(c.Name).Add(0)
440458
ctrlmetrics.WorkerCount.WithLabelValues(c.Name).Set(float64(c.MaxConcurrentReconciles))
441459
ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0)
442460
}

pkg/internal/controller/controller_test.go

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,152 @@ var _ = Describe("controller", func() {
155155
Expect(err).To(Equal(context.DeadlineExceeded))
156156
})
157157

158+
Context("prometheus metric reconcile_timeouts", func() {
159+
var reconcileTimeouts dto.Metric
160+
161+
BeforeEach(func() {
162+
ctrlmetrics.ReconcileTimeouts.Reset()
163+
reconcileTimeouts.Reset()
164+
ctrl.Name = testControllerName
165+
})
166+
167+
It("should increment when ReconciliationTimeout fires", func(ctx SpecContext) {
168+
Expect(func() error {
169+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
170+
if reconcileTimeouts.GetCounter().GetValue() != 0.0 {
171+
return fmt.Errorf("metric reconcile timeouts not reset")
172+
}
173+
return nil
174+
}()).Should(Succeed())
175+
176+
ctrl.ReconciliationTimeout = time.Duration(1) // One nanosecond
177+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
178+
<-ctx.Done()
179+
return reconcile.Result{}, ctx.Err()
180+
})
181+
_, err := ctrl.Reconcile(ctx,
182+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
183+
Expect(err).To(HaveOccurred())
184+
Expect(err).To(Equal(context.DeadlineExceeded))
185+
186+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
187+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(1.0))
188+
})
189+
190+
It("should not increment when user code cancels context early", func(ctx SpecContext) {
191+
Expect(func() error {
192+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
193+
if reconcileTimeouts.GetCounter().GetValue() != 0.0 {
194+
return fmt.Errorf("metric reconcile timeouts not reset")
195+
}
196+
return nil
197+
}()).Should(Succeed())
198+
199+
ctrl.ReconciliationTimeout = 10 * time.Second
200+
userCancelCause := errors.New("user cancellation")
201+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
202+
// User code creates its own timeout with a different cause
203+
userCtx, cancel := context.WithTimeoutCause(ctx, time.Millisecond, userCancelCause)
204+
defer cancel()
205+
<-userCtx.Done()
206+
return reconcile.Result{}, context.Cause(userCtx)
207+
})
208+
_, err := ctrl.Reconcile(ctx,
209+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
210+
Expect(err).To(HaveOccurred())
211+
Expect(errors.Is(err, userCancelCause)).To(BeTrue())
212+
213+
// Metric should not be incremented because the wrapper timeout didn't fire
214+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
215+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
216+
})
217+
218+
It("should not increment when reconciliation completes before timeout", func(ctx SpecContext) {
219+
Expect(func() error {
220+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
221+
if reconcileTimeouts.GetCounter().GetValue() != 0.0 {
222+
return fmt.Errorf("metric reconcile timeouts not reset")
223+
}
224+
return nil
225+
}()).Should(Succeed())
226+
227+
ctrl.ReconciliationTimeout = 10 * time.Second
228+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
229+
// Reconcile completes successfully before timeout
230+
return reconcile.Result{}, nil
231+
})
232+
_, err := ctrl.Reconcile(ctx,
233+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
234+
Expect(err).NotTo(HaveOccurred())
235+
236+
// Metric should not be incremented because the timeout was not exceeded
237+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
238+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
239+
})
240+
241+
It("should increment multiple times when multiple reconciles timeout", func(ctx SpecContext) {
242+
Expect(func() error {
243+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
244+
if reconcileTimeouts.GetCounter().GetValue() != 0.0 {
245+
return fmt.Errorf("metric reconcile timeouts not reset")
246+
}
247+
return nil
248+
}()).Should(Succeed())
249+
250+
ctrl.ReconciliationTimeout = time.Duration(1) // One nanosecond
251+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
252+
<-ctx.Done()
253+
return reconcile.Result{}, ctx.Err()
254+
})
255+
256+
const numTimeouts = 3
257+
// Call Reconcile multiple times, each should timeout and increment the metric
258+
for i := range numTimeouts {
259+
_, err := ctrl.Reconcile(ctx,
260+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: fmt.Sprintf("bar%d", i)}})
261+
Expect(err).To(HaveOccurred())
262+
Expect(err).To(Equal(context.DeadlineExceeded))
263+
}
264+
265+
// Metric should be incremented 3 times
266+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
267+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(float64(numTimeouts)))
268+
})
269+
270+
It("should not increment when parent context is cancelled", func(ctx SpecContext) {
271+
Expect(func() error {
272+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
273+
if reconcileTimeouts.GetCounter().GetValue() != 0.0 {
274+
return fmt.Errorf("metric reconcile timeouts not reset")
275+
}
276+
return nil
277+
}()).Should(Succeed())
278+
279+
// Create a parent context that will be cancelled
280+
parentCtx, cancel := context.WithCancel(ctx)
281+
defer cancel()
282+
283+
ctrl.ReconciliationTimeout = 10 * time.Second
284+
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
285+
// Wait for parent cancellation
286+
<-ctx.Done()
287+
return reconcile.Result{}, ctx.Err()
288+
})
289+
290+
// Cancel parent context immediately
291+
cancel()
292+
293+
_, err := ctrl.Reconcile(parentCtx,
294+
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
295+
Expect(err).To(HaveOccurred())
296+
Expect(err).To(Equal(context.Canceled))
297+
298+
// Metric should not be incremented because the wrapper timeout didn't fire
299+
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
300+
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
301+
})
302+
})
303+
158304
It("should not configure a timeout if ReconciliationTimeout is zero", func(ctx SpecContext) {
159305
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
160306
defer GinkgoRecover()

pkg/internal/controller/metrics/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,15 @@ var (
8080
Name: "controller_runtime_active_workers",
8181
Help: "Number of currently used workers per controller",
8282
}, []string{"controller"})
83+
84+
// ReconcileTimeouts is a prometheus counter metric which holds the total
85+
// number of reconciliations that timed out due to the ReconciliationTimeout
86+
// context timeout. This metric only increments when the wrapper timeout fires,
87+
// not when user reconcilers cancels the context or completes before the timeout.
88+
ReconcileTimeouts = prometheus.NewCounterVec(prometheus.CounterOpts{
89+
Name: "controller_runtime_reconcile_timeouts_total",
90+
Help: "Total number of reconciliation timeouts per controller",
91+
}, []string{"controller"})
8392
)
8493

8594
func init() {
@@ -91,6 +100,7 @@ func init() {
91100
ReconcileTime,
92101
WorkerCount,
93102
ActiveWorkers,
103+
ReconcileTimeouts,
94104
// expose process metrics like CPU, Memory, file descriptor usage etc.
95105
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
96106
// expose all Go runtime metrics like GC stats, memory stats etc.

0 commit comments

Comments
 (0)