Skip to content

Commit 49d7f3d

Browse files
committed
Improve timeout error messaging. Make CSOT tests more reliable.
1 parent 69502a1 commit 49d7f3d

File tree

4 files changed

+78
-43
lines changed

4 files changed

+78
-43
lines changed

mongo/integration/csot_test.go

Lines changed: 15 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -372,21 +372,25 @@ func TestCSOT(t *testing.T) {
372372
Data: mtest.FailPointData{
373373
FailCommands: []string{tc.commandName},
374374
BlockConnection: true,
375-
BlockTimeMS: 100,
375+
BlockTimeMS: 500,
376376
},
377377
})
378378

379379
tpm := eventtest.NewTestPoolMonitor()
380-
mt.ResetClient(options.Client().SetPoolMonitor(tpm.PoolMonitor))
380+
mt.ResetClient(options.Client().
381+
SetPoolMonitor(tpm.PoolMonitor))
381382

382383
// Run 5 operations that time out with CSOT disabled, then
383384
// assert that at least 1 connection was closed during those
384385
// timeouts.
385386
for i := 0; i < 5; i++ {
386-
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
387+
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Millisecond)
387388
err := tc.operation(ctx, mt.Coll)
388389
cancel()
389-
require.True(mt, mongo.IsTimeout(err), "expected a timeout error")
390+
391+
if !mongo.IsTimeout(err) {
392+
t.Logf("CSOT-disabled operation %d returned a non-timeout error: %v", i, err)
393+
}
390394
}
391395

392396
closedEvents := tpm.Events(func(pe *event.PoolEvent) bool {
@@ -405,10 +409,13 @@ func TestCSOT(t *testing.T) {
405409
// Run 5 operations that time out with CSOT enabled, then
406410
// assert that no connections were closed.
407411
for i := 0; i < 5; i++ {
408-
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
412+
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Millisecond)
409413
err := tc.operation(ctx, mt.Coll)
410414
cancel()
411-
require.True(mt, mongo.IsTimeout(err), "expected a timeout error")
415+
416+
if !mongo.IsTimeout(err) {
417+
t.Logf("CSOT-enabled operation %d returned a non-timeout error: %v", i, err)
418+
}
412419
}
413420

414421
closedEvents = tpm.Events(func(pe *event.PoolEvent) bool {
@@ -456,31 +463,6 @@ func TestCSOT_errors(t *testing.T) {
456463
err)
457464
})
458465

459-
// Test that, when CSOT is enabled, the error returned when a context
460-
// deadline is exceeded before a network operation wraps
461-
// "context.DeadlineExceeded".
462-
mt.Run("ErrDeadlineWouldBeExceeded wraps context.DeadlineExceeded", func(mt *mtest.T) {
463-
_, err := mt.Coll.InsertOne(context.Background(), bson.D{})
464-
require.NoError(mt, err, "InsertOne error")
465-
466-
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Microsecond)
467-
defer cancel()
468-
err = mt.Coll.FindOne(ctx, bson.D{}).Err()
469-
470-
assert.True(mt,
471-
errors.Is(err, driver.ErrDeadlineWouldBeExceeded),
472-
"expected error %[1]T(%[1]q) to wrap driver.ErrDeadlineWouldBeExceeded",
473-
err)
474-
assert.True(mt,
475-
errors.Is(err, context.DeadlineExceeded),
476-
"expected error %[1]T(%[1]q) to wrap context.DeadlineExceeded",
477-
err)
478-
assert.True(mt,
479-
mongo.IsTimeout(err),
480-
"expected error %[1]T(%[1]q) to be a timeout error",
481-
err)
482-
})
483-
484466
// Test that, when CSOT is enabled, the error returned when a context
485467
// deadline is exceeded during a network operation wraps
486468
// "context.DeadlineExceeded".
@@ -496,11 +478,11 @@ func TestCSOT_errors(t *testing.T) {
496478
Data: mtest.FailPointData{
497479
FailCommands: []string{"find"},
498480
BlockConnection: true,
499-
BlockTimeMS: 100,
481+
BlockTimeMS: 500,
500482
},
501483
})
502484

503-
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
485+
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Millisecond)
504486
defer cancel()
505487
err = mt.Coll.FindOne(ctx, bson.D{}).Err()
506488

x/mongo/driver/operation.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -688,7 +688,7 @@ func (op Operation) Execute(ctx context.Context) error {
688688
first = false
689689
}
690690

691-
maxTimeMS, err := op.calculateMaxTimeMS(ctx, srvr.RTTMonitor().P90())
691+
maxTimeMS, err := op.calculateMaxTimeMS(ctx, srvr.RTTMonitor())
692692
if err != nil {
693693
return err
694694
}
@@ -781,7 +781,7 @@ func (op Operation) Execute(ctx context.Context) error {
781781
} else if deadline, ok := ctx.Deadline(); ok {
782782
if csot.IsTimeoutContext(ctx) && time.Now().Add(srvr.RTTMonitor().P90()).After(deadline) {
783783
err = fmt.Errorf(
784-
"remaining time %v until context deadline is less than 90th percentile RTT: %w\n%v",
784+
"remaining time %v until context deadline is less than 90th percentile network round-trip time: %w\n%v",
785785
time.Until(deadline),
786786
ErrDeadlineWouldBeExceeded,
787787
srvr.RTTMonitor().Stats())
@@ -1566,25 +1566,25 @@ func (op Operation) addClusterTime(dst []byte, desc description.SelectedServer)
15661566
// if the ctx is a Timeout context. If the context is not a Timeout context, it uses the
15671567
// operation's MaxTimeMS if set. If no MaxTimeMS is set on the operation, and context is
15681568
// not a Timeout context, calculateMaxTimeMS returns 0.
1569-
func (op Operation) calculateMaxTimeMS(ctx context.Context, rtt90 time.Duration) (uint64, error) {
1569+
func (op Operation) calculateMaxTimeMS(ctx context.Context, mon RTTMonitor) (uint64, error) {
15701570
if csot.IsTimeoutContext(ctx) {
15711571
if op.OmitCSOTMaxTimeMS {
15721572
return 0, nil
15731573
}
15741574

15751575
if deadline, ok := ctx.Deadline(); ok {
15761576
remainingTimeout := time.Until(deadline)
1577+
rtt90 := mon.P90()
15771578
maxTime := remainingTimeout - rtt90
15781579

15791580
// Always round up to the next millisecond value so we never truncate the calculated
15801581
// maxTimeMS value (e.g. 400 microseconds evaluates to 1ms, not 0ms).
15811582
maxTimeMS := int64((maxTime + (time.Millisecond - 1)) / time.Millisecond)
15821583
if maxTimeMS <= 0 {
15831584
return 0, fmt.Errorf(
1584-
"maxTimeMS calculated by context deadline is negative "+
1585-
"(remaining time: %v, 90th percentile RTT %v): %w",
1585+
"negative maxTimeMS: remaining time %v until context deadline is less than 90th percentile network round-trip time (%v): %w",
15861586
remainingTimeout,
1587-
rtt90,
1587+
mon.Stats(),
15881588
ErrDeadlineWouldBeExceeded)
15891589
}
15901590

x/mongo/driver/operation_test.go

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,7 @@ func TestOperation(t *testing.T) {
328328
t.Run(tc.name, func(t *testing.T) {
329329
t.Parallel()
330330

331-
got, err := tc.op.calculateMaxTimeMS(tc.ctx, tc.rtt90)
331+
got, err := tc.op.calculateMaxTimeMS(tc.ctx, mockRTTMonitor{p90: tc.rtt90})
332332

333333
// Assert that the calculated maxTimeMS is less than or equal to the expected value. A few
334334
// milliseconds will have elapsed toward the context deadline, and (remainingTimeout
@@ -654,6 +654,35 @@ func TestOperation(t *testing.T) {
654654
// the TransientTransactionError label.
655655
assert.Equal(t, err, context.Canceled, "expected context.Canceled error, got %v", err)
656656
})
657+
t.Run("ErrDeadlineWouldBeExceeded wraps context.DeadlineExceeded", func(t *testing.T) {
658+
// Create a deployment that returns a server that reports a 90th
659+
// percentile RTT of 1 minute.
660+
d := new(mockDeployment)
661+
d.returns.server = mockServer{
662+
conn: new(mockConnection),
663+
rttMonitor: mockRTTMonitor{p90: 1 * time.Minute},
664+
}
665+
666+
// Create an operation with a Timeout specified to enable CSOT behavior.
667+
var dur time.Duration
668+
op := Operation{
669+
Database: "foobar",
670+
Deployment: d,
671+
CommandFn: func(dst []byte, desc description.SelectedServer) ([]byte, error) {
672+
return dst, nil
673+
},
674+
Timeout: &dur,
675+
}
676+
677+
// Call the operation with a context with a deadline less than the 90th
678+
// percentile RTT configured above.
679+
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
680+
defer cancel()
681+
err := op.Execute(ctx)
682+
683+
assert.ErrorIs(t, err, ErrDeadlineWouldBeExceeded)
684+
assert.ErrorIs(t, err, context.DeadlineExceeded)
685+
})
657686
}
658687

659688
func createExhaustServerResponse(response bsoncore.Document, moreToCome bool) []byte {
@@ -713,6 +742,27 @@ func (m *mockServerSelector) String() string {
713742
panic("not implemented")
714743
}
715744

745+
type mockServer struct {
746+
conn Connection
747+
err error
748+
rttMonitor RTTMonitor
749+
}
750+
751+
func (ms mockServer) Connection(context.Context) (Connection, error) { return ms.conn, ms.err }
752+
func (ms mockServer) RTTMonitor() RTTMonitor { return ms.rttMonitor }
753+
754+
type mockRTTMonitor struct {
755+
ewma time.Duration
756+
min time.Duration
757+
p90 time.Duration
758+
stats string
759+
}
760+
761+
func (mrm mockRTTMonitor) EWMA() time.Duration { return mrm.ewma }
762+
func (mrm mockRTTMonitor) Min() time.Duration { return mrm.min }
763+
func (mrm mockRTTMonitor) P90() time.Duration { return mrm.p90 }
764+
func (mrm mockRTTMonitor) Stats() string { return mrm.stats }
765+
716766
type mockConnection struct {
717767
// parameters
718768
pWriteWM []byte

x/mongo/driver/topology/rtt_monitor.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,10 @@ func (r *rttMonitor) Stats() string {
322322
}
323323
}
324324

325-
return fmt.Sprintf(`Round-trip-time monitor statistics:`+"\n"+
326-
`average RTT: %v, minimum RTT: %v, 90th percentile RTT: %v, standard dev: %v`+"\n",
327-
time.Duration(avg), r.minRTT, r.rtt90, time.Duration(stdDev))
325+
return fmt.Sprintf(
326+
"network round-trip time stats: avg: %v, min: %v, 90th pct: %v, stddev: %v",
327+
time.Duration(avg),
328+
r.minRTT,
329+
r.rtt90,
330+
time.Duration(stdDev))
328331
}

0 commit comments

Comments
 (0)