Skip to content

Commit 25043c1

Browse files
Backport of enhance TLS certificate monitoring with retry logic and severity logging into release/2.0.x (#23577)
backport of commit 8b8f3c5 Co-authored-by: Sanika Chavan <sanika.vikaschavan@hashicorp.com>
1 parent 87448b8 commit 25043c1

3 files changed

Lines changed: 185 additions & 12 deletions

File tree

agent/agent.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -890,7 +890,12 @@ func (a *Agent) Start(ctx context.Context) error {
890890
go a.retryJoinWAN()
891891
}
892892

893-
if a.config.Telemetry.CertificateEnabled && a.tlsConfigurator.Cert() != nil {
893+
shouldMonitorAgentTLS := a.config.Telemetry.CertificateEnabled &&
894+
(a.tlsConfigurator.Cert() != nil ||
895+
a.config.AutoEncryptTLS ||
896+
a.config.TLS.InternalRPC.CertFile != "" ||
897+
a.config.TLS.InternalRPC.KeyFile != "")
898+
if shouldMonitorAgentTLS {
894899
m := tlsCertExpirationMonitor(
895900
a.tlsConfigurator,
896901
a.config.Datacenter,

agent/consul/leader_metrics.go

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ var (
2626

2727
// TestCertExpirationMonitorInterval overrides the default emission cadence in tests.
2828
TestCertExpirationMonitorInterval time.Duration
29+
30+
// certExpirationMonitorRetryInterval is used when metric emission fails (for example
31+
// when cert state is not yet initialized during startup). In that case we retry faster
32+
// than the normal hourly interval.
33+
certExpirationMonitorRetryInterval = 10 * time.Second
2934
)
3035

3136
var LeaderCertExpirationGauges = []prometheus.GaugeDefinition{
@@ -131,6 +136,27 @@ type CertExpirationMonitor struct {
131136

132137
const certExpirationMonitorInterval = time.Hour
133138

139+
func certDaysRemaining(untilAfter time.Duration) int {
140+
return int(math.Floor(untilAfter.Hours() / 24))
141+
}
142+
143+
func certLogSeverity(untilAfter time.Duration, criticalDays, warningDays int) string {
144+
if untilAfter <= 0 {
145+
return "expired"
146+
}
147+
148+
criticalThreshold := time.Duration(criticalDays) * 24 * time.Hour
149+
warningThreshold := time.Duration(warningDays) * 24 * time.Hour
150+
switch {
151+
case untilAfter <= criticalThreshold:
152+
return "critical"
153+
case untilAfter <= warningThreshold:
154+
return "warning"
155+
default:
156+
return "ok"
157+
}
158+
}
159+
134160
func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
135161

136162
// Check if certificate telemetry is enabled (only for server-based monitors)
@@ -142,20 +168,22 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
142168
if m.Interval > 0 {
143169
interval = m.Interval
144170
}
145-
146-
ticker := time.NewTicker(interval)
147-
defer ticker.Stop()
171+
retryInterval := certExpirationMonitorRetryInterval
172+
if interval < retryInterval {
173+
retryInterval = interval
174+
}
148175

149176
logger := m.Logger.With("metric", strings.Join(m.Key, "."))
150177

151-
emitMetric := func() {
178+
emitMetric := func() bool {
152179
_, untilAfter, err := m.Query()
153180
if err != nil {
154181
logger.Warn("failed to emit certificate expiry metric", "error", err)
155-
return
182+
metrics.SetGaugeWithLabels(m.Key, float32(math.NaN()), m.Labels)
183+
return false
156184
}
157185

158-
daysRemaining := int(untilAfter.Hours() / 24)
186+
daysRemaining := certDaysRemaining(untilAfter)
159187

160188
// Get thresholds from Server config or use provided values
161189
var criticalDays, warningDays int
@@ -209,19 +237,29 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
209237
}
210238

211239
// Log based on threshold severity with detailed context
212-
if daysRemaining <= criticalDays {
240+
switch certLogSeverity(untilAfter, criticalDays, warningDays) {
241+
case "expired":
242+
logger.Error("certificate has expired", logFields...)
243+
case "critical":
213244
logger.Error("certificate expiring soon", logFields...)
214-
} else if daysRemaining <= warningDays {
245+
case "warning":
215246
logger.Warn("certificate expiring soon", logFields...)
216247
}
217248

218249
expiry := untilAfter / time.Second
219250
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
251+
return true
220252
}
221253

222254
// emit the metric immediately so that if a cert was just updated the
223255
// new metric will be updated to the new expiration time.
224-
emitMetric()
256+
nextInterval := interval
257+
if !emitMetric() {
258+
nextInterval = retryInterval
259+
}
260+
261+
timer := time.NewTimer(nextInterval)
262+
defer timer.Stop()
225263

226264
for {
227265
select {
@@ -230,8 +268,12 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
230268
// metric from a non-leader, it does not get a stale value.
231269
metrics.SetGaugeWithLabels(m.Key, float32(math.NaN()), m.Labels)
232270
return nil
233-
case <-ticker.C:
234-
emitMetric()
271+
case <-timer.C:
272+
nextInterval = interval
273+
if !emitMetric() {
274+
nextInterval = retryInterval
275+
}
276+
timer.Reset(nextInterval)
235277
}
236278
}
237279
}

agent/consul/leader_metrics_test.go

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,15 @@
44
package consul
55

66
import (
7+
"context"
8+
"errors"
9+
"sync/atomic"
710
"testing"
11+
"time"
12+
13+
"github.com/armon/go-metrics"
14+
15+
"github.com/hashicorp/go-hclog"
816
)
917

1018
// TestExpiresSoon was removed as we now use configurable thresholds
@@ -259,3 +267,121 @@ func TestCertificateTelemetry_Disabled(t *testing.T) {
259267
t.Error("telemetry should be disabled")
260268
}
261269
}
270+
271+
func TestCertExpirationMonitor_RetriesQuicklyAfterQueryFailure(t *testing.T) {
272+
oldRetry := certExpirationMonitorRetryInterval
273+
certExpirationMonitorRetryInterval = 20 * time.Millisecond
274+
t.Cleanup(func() {
275+
certExpirationMonitorRetryInterval = oldRetry
276+
})
277+
278+
var calls int32
279+
success := make(chan struct{}, 1)
280+
m := CertExpirationMonitor{
281+
Key: []string{"test", "cert", "expiry"},
282+
Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}},
283+
Logger: hclog.NewNullLogger(),
284+
Interval: time.Hour,
285+
Query: func() (time.Duration, time.Duration, error) {
286+
if atomic.AddInt32(&calls, 1) == 1 {
287+
return 0, 0, errors.New("not ready")
288+
}
289+
select {
290+
case success <- struct{}{}:
291+
default:
292+
}
293+
return 24 * time.Hour, 23 * time.Hour, nil
294+
},
295+
}
296+
297+
ctx, cancel := context.WithCancel(context.Background())
298+
defer cancel()
299+
300+
done := make(chan error, 1)
301+
go func() {
302+
done <- m.Monitor(ctx)
303+
}()
304+
305+
select {
306+
case <-success:
307+
cancel()
308+
case <-time.After(2 * time.Second):
309+
t.Fatal("monitor did not retry quickly after initial query failure")
310+
}
311+
312+
select {
313+
case err := <-done:
314+
if err != nil {
315+
t.Fatalf("monitor returned unexpected error: %v", err)
316+
}
317+
case <-time.After(2 * time.Second):
318+
t.Fatal("monitor did not exit after cancellation")
319+
}
320+
321+
if got := atomic.LoadInt32(&calls); got < 2 {
322+
t.Fatalf("expected at least 2 query attempts, got %d", got)
323+
}
324+
}
325+
326+
func TestCertLogSeverity(t *testing.T) {
327+
tests := []struct {
328+
name string
329+
untilAfter time.Duration
330+
critical int
331+
warning int
332+
wantLevel string
333+
wantMinDays int
334+
}{
335+
{
336+
name: "expired_subsecond",
337+
untilAfter: -565 * time.Millisecond,
338+
critical: 7,
339+
warning: 30,
340+
wantLevel: "expired",
341+
wantMinDays: -1,
342+
},
343+
{
344+
name: "expired_one_hour",
345+
untilAfter: -1 * time.Hour,
346+
critical: 7,
347+
warning: 30,
348+
wantLevel: "expired",
349+
wantMinDays: -1,
350+
},
351+
{
352+
name: "critical_threshold",
353+
untilAfter: 6 * 24 * time.Hour,
354+
critical: 7,
355+
warning: 30,
356+
wantLevel: "critical",
357+
wantMinDays: 6,
358+
},
359+
{
360+
name: "warning_threshold",
361+
untilAfter: 10 * 24 * time.Hour,
362+
critical: 7,
363+
warning: 30,
364+
wantLevel: "warning",
365+
wantMinDays: 10,
366+
},
367+
{
368+
name: "ok",
369+
untilAfter: 60 * 24 * time.Hour,
370+
critical: 7,
371+
warning: 30,
372+
wantLevel: "ok",
373+
wantMinDays: 60,
374+
},
375+
}
376+
377+
for _, tt := range tests {
378+
t.Run(tt.name, func(t *testing.T) {
379+
if got := certLogSeverity(tt.untilAfter, tt.critical, tt.warning); got != tt.wantLevel {
380+
t.Fatalf("expected severity %q, got %q", tt.wantLevel, got)
381+
}
382+
if got := certDaysRemaining(tt.untilAfter); got < tt.wantMinDays {
383+
t.Fatalf("expected days remaining >= %d, got %d", tt.wantMinDays, got)
384+
}
385+
})
386+
}
387+
}

0 commit comments

Comments
 (0)