Skip to content

Commit 3f28f13

Browse files
authored
fix: stabilize node condition transition time for multiple errors (#52)
1 parent 87002d2 commit 3f28f13

2 files changed

Lines changed: 157 additions & 2 deletions

File tree

pkg/manager/node_exporter.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package manager
33
import (
44
"context"
55
"fmt"
6+
"strings"
67
"sync"
78
"time"
89

@@ -120,9 +121,18 @@ func (e *nodeExporter) Fatal(ctx context.Context, monitorCondition monitor.Condi
120121
LastHeartbeatTime: now,
121122
}
122123
if oldCondition, ok := e.managedConditions[conditionType]; ok {
123-
// if the reason AND message have not changed, use the old transition time
124-
if oldCondition.Reason == newCondition.Reason && oldCondition.Message == newCondition.Message {
124+
// if the status has not changed, use the old transition time
125+
if oldCondition.Status == newCondition.Status {
125126
newCondition.LastTransitionTime = oldCondition.LastTransitionTime
127+
128+
// aggregate messages if the status is the same (e.g. both False)
129+
// and ensure that we don't duplicate identical messages.
130+
if oldCondition.Message != "" && oldCondition.Message != newCondition.Message && !strings.Contains(oldCondition.Message, newCondition.Message) {
131+
newCondition.Message = oldCondition.Message + "; " + newCondition.Message
132+
} else if strings.Contains(oldCondition.Message, newCondition.Message) {
133+
// if the old message already contains the new one, preserve the old one (which might have other aggregated messages)
134+
newCondition.Message = oldCondition.Message
135+
}
126136
}
127137
}
128138
e.managedConditions[conditionType] = newCondition

pkg/manager/node_exporter_test.go

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,3 +197,148 @@ func isConditionEqual(l corev1.NodeCondition, r corev1.NodeCondition) bool {
197197
l.Message == r.Message &&
198198
l.Reason == r.Reason
199199
}
200+
201+
func TestNodeExporter_LastTransitionTimeFlapping(t *testing.T) {
202+
ctx := context.TODO()
203+
fakeClient := fake.NewFakeClient()
204+
nodeName := "test-node"
205+
initialNode := corev1.Node{
206+
ObjectMeta: metav1.ObjectMeta{Name: nodeName},
207+
Status: corev1.NodeStatus{
208+
Conditions: []corev1.NodeCondition{
209+
{
210+
Type: "AcceleratedHardwareReady",
211+
Status: corev1.ConditionTrue,
212+
Reason: "Healthy",
213+
Message: "All good",
214+
LastHeartbeatTime: metav1.Now(),
215+
LastTransitionTime: metav1.Now(),
216+
},
217+
},
218+
},
219+
}
220+
if err := fakeClient.Create(ctx, &initialNode); err != nil {
221+
t.Fatalf("failed to create initial node: %v", err)
222+
}
223+
224+
nodeExporter := manager.NewNodeExporter(
225+
&initialNode,
226+
fakeClient,
227+
record.NewFakeRecorder(100),
228+
map[corev1.NodeConditionType]manager.NodeConditionConfig{
229+
"AcceleratedHardwareReady": {
230+
ReadyReason: "Healthy",
231+
ReadyMessage: "All good",
232+
},
233+
},
234+
)
235+
236+
heartbeatChan := make(chan time.Time)
237+
reportChan := make(chan time.Time)
238+
go nodeExporter.RunWithTickers(ctx, heartbeatChan, reportChan)
239+
240+
conditionType := corev1.NodeConditionType("AcceleratedHardwareReady")
241+
242+
// 1. Report first fatal error
243+
err1 := monitor.Condition{
244+
Reason: "ErrorA",
245+
Message: "MessageA",
246+
Severity: monitor.SeverityFatal,
247+
}
248+
if err := nodeExporter.Fatal(ctx, err1, conditionType); err != nil {
249+
t.Fatal(err)
250+
}
251+
252+
// Capture the transition time
253+
reportChan <- time.Now()
254+
255+
// Wait a bit for the report to process
256+
time.Sleep(time.Millisecond * 100)
257+
258+
var node corev1.Node
259+
if err := fakeClient.Get(ctx, client.ObjectKey{Name: nodeName}, &node); err != nil {
260+
t.Fatal(err)
261+
}
262+
263+
var ltt1 time.Time
264+
for _, c := range node.Status.Conditions {
265+
if c.Type == conditionType {
266+
ltt1 = c.LastTransitionTime.Time
267+
break
268+
}
269+
}
270+
if ltt1.IsZero() {
271+
t.Fatal("LastTransitionTime not set")
272+
}
273+
t.Logf("LTT1: %v", ltt1)
274+
275+
// Wait a bit to ensure 'now' changes (metav1.Now() has 1-second resolution)
276+
time.Sleep(time.Millisecond * 1100)
277+
278+
// 2. Report second fatal error with different message
279+
err2 := monitor.Condition{
280+
Reason: "ErrorB",
281+
Message: "MessageB",
282+
Severity: monitor.SeverityFatal,
283+
}
284+
if err := nodeExporter.Fatal(ctx, err2, conditionType); err != nil {
285+
t.Fatal(err)
286+
}
287+
288+
reportChan <- time.Now()
289+
time.Sleep(time.Millisecond * 200)
290+
291+
if err := fakeClient.Get(ctx, client.ObjectKey{Name: nodeName}, &node); err != nil {
292+
t.Fatal(err)
293+
}
294+
295+
var ltt2 time.Time
296+
for _, c := range node.Status.Conditions {
297+
if c.Type == conditionType {
298+
ltt2 = c.LastTransitionTime.Time
299+
break
300+
}
301+
}
302+
t.Logf("LTT2: %v", ltt2)
303+
304+
if ltt2.After(ltt1) {
305+
t.Errorf("LastTransitionTime flapped! It should have been preserved because status didn't change from False. ltt1: %v, ltt2: %v", ltt1, ltt2)
306+
}
307+
if !ltt2.Equal(ltt1) {
308+
t.Errorf("LastTransitionTime changed! It should have been identical. ltt1: %v, ltt2: %v", ltt1, ltt2)
309+
}
310+
311+
// Verify the message was still updated to the latest one
312+
var latestMessage string
313+
for _, c := range node.Status.Conditions {
314+
if c.Type == conditionType {
315+
latestMessage = c.Message
316+
break
317+
}
318+
}
319+
if latestMessage != "MessageA; MessageB" {
320+
t.Errorf("Message was not updated to latest or aggregated. expected: MessageA; MessageB, got: %s", latestMessage)
321+
}
322+
323+
// 3. Report same error again - should not duplicate in message
324+
if err := nodeExporter.Fatal(ctx, err1, conditionType); err != nil {
325+
t.Fatal(err)
326+
}
327+
reportChan <- time.Now()
328+
time.Sleep(time.Millisecond * 200)
329+
330+
if err := fakeClient.Get(ctx, client.ObjectKey{Name: nodeName}, &node); err != nil {
331+
t.Fatal(err)
332+
}
333+
334+
for _, c := range node.Status.Conditions {
335+
if c.Type == conditionType {
336+
latestMessage = c.Message
337+
break
338+
}
339+
}
340+
// It should still be "MessageA; MessageB" because MessageA is already contained in it.
341+
if latestMessage != "MessageA; MessageB" {
342+
t.Errorf("Message was incorrectly updated with duplicates or cleared. expected: MessageA; MessageB, got: %s", latestMessage)
343+
}
344+
}

0 commit comments

Comments
 (0)