2626
2727 // TestCertExpirationMonitorInterval overrides the default emission cadence in tests.
2828 TestCertExpirationMonitorInterval time.Duration
29+
30+ // certExpirationMonitorRetryInterval is used when metric emission fails (for example
31+ // when cert state is not yet initialized during startup). In that case we retry faster
32+ // than the normal hourly interval.
33+ certExpirationMonitorRetryInterval = 10 * time .Second
2934)
3035
3136var LeaderCertExpirationGauges = []prometheus.GaugeDefinition {
@@ -131,6 +136,27 @@ type CertExpirationMonitor struct {
131136
// certExpirationMonitorInterval is a fixed one-hour duration.
// NOTE(review): presumably the default emission cadence used by Monitor when
// no override interval is configured — the assignment is not visible here; confirm.
132137const certExpirationMonitorInterval = time .Hour
133138
// certDaysRemaining returns the number of whole days contained in untilAfter,
// rounded toward negative infinity. A non-negative duration shorter than one
// day yields 0; any negative duration (an already-expired certificate) yields
// at most -1, so "expired an hour ago" is never reported as 0 days remaining.
//
// The computation uses exact integer arithmetic on the nanosecond count.
// Going through Duration.Hours() converts to float64, which can round a value
// that is a few nanoseconds short of a day boundary up to the boundary itself
// and over-report the remaining days by one.
func certDaysRemaining(untilAfter time.Duration) int {
	const day = 24 * time.Hour

	days := untilAfter / day
	// Go integer division truncates toward zero; step down by one when a
	// negative remainder shows that truncation rounded a negative value up.
	if untilAfter%day < 0 {
		days--
	}
	return int(days)
}
142+
// certLogSeverity maps the time left before a certificate expires to a
// severity label:
//
//   - "expired"  when untilAfter is zero or negative
//   - "critical" when untilAfter is at most criticalDays days
//   - "warning"  when untilAfter is at most warningDays days
//   - "ok"       otherwise
//
// The checks run in that order, so the critical threshold takes precedence
// over the warning threshold when both match.
func certLogSeverity(untilAfter time.Duration, criticalDays, warningDays int) string {
	const day = 24 * time.Hour

	if untilAfter <= 0 {
		return "expired"
	}
	if limit := time.Duration(criticalDays) * day; untilAfter <= limit {
		return "critical"
	}
	if limit := time.Duration(warningDays) * day; untilAfter <= limit {
		return "warning"
	}
	return "ok"
}
159+
134160func (m CertExpirationMonitor ) Monitor (ctx context.Context ) error {
135161
136162 // Check if certificate telemetry is enabled (only for server-based monitors)
@@ -142,20 +168,22 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
142168 if m .Interval > 0 {
143169 interval = m .Interval
144170 }
145-
146- ticker := time .NewTicker (interval )
147- defer ticker .Stop ()
171+ retryInterval := certExpirationMonitorRetryInterval
172+ if interval < retryInterval {
173+ retryInterval = interval
174+ }
148175
149176 logger := m .Logger .With ("metric" , strings .Join (m .Key , "." ))
150177
151- emitMetric := func () {
178+ emitMetric := func () bool {
152179 _ , untilAfter , err := m .Query ()
153180 if err != nil {
154181 logger .Warn ("failed to emit certificate expiry metric" , "error" , err )
155- return
182+ metrics .SetGaugeWithLabels (m .Key , float32 (math .NaN ()), m .Labels )
183+ return false
156184 }
157185
158- daysRemaining := int (untilAfter . Hours () / 24 )
186+ daysRemaining := certDaysRemaining (untilAfter )
159187
160188 // Get thresholds from Server config or use provided values
161189 var criticalDays , warningDays int
@@ -209,19 +237,29 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
209237 }
210238
211239 // Log based on threshold severity with detailed context
212- if daysRemaining <= criticalDays {
240+ switch certLogSeverity (untilAfter , criticalDays , warningDays ) {
241+ case "expired" :
242+ logger .Error ("certificate has expired" , logFields ... )
243+ case "critical" :
213244 logger .Error ("certificate expiring soon" , logFields ... )
214- } else if daysRemaining <= warningDays {
245+ case "warning" :
215246 logger .Warn ("certificate expiring soon" , logFields ... )
216247 }
217248
218249 expiry := untilAfter / time .Second
219250 metrics .SetGaugeWithLabels (m .Key , float32 (expiry ), m .Labels )
251+ return true
220252 }
221253
222254 // emit the metric immediately so that if a cert was just updated the
223255 // new metric will be updated to the new expiration time.
224- emitMetric ()
256+ nextInterval := interval
257+ if ! emitMetric () {
258+ nextInterval = retryInterval
259+ }
260+
261+ timer := time .NewTimer (nextInterval )
262+ defer timer .Stop ()
225263
226264 for {
227265 select {
@@ -230,8 +268,12 @@ func (m CertExpirationMonitor) Monitor(ctx context.Context) error {
230268 // metric from a non-leader, it does not get a stale value.
231269 metrics .SetGaugeWithLabels (m .Key , float32 (math .NaN ()), m .Labels )
232270 return nil
233- case <- ticker .C :
234- emitMetric ()
271+ case <- timer .C :
272+ nextInterval = interval
273+ if ! emitMetric () {
274+ nextInterval = retryInterval
275+ }
276+ timer .Reset (nextInterval )
235277 }
236278 }
237279}
0 commit comments