Skip to content

Commit e413838

Browse files
authored
client: Change connectivity state to CONNECTING when creating the name resolver (#8710)
Fixes #7686 #### Current Behavior - When client exits IDLE and creates the name resolver, it stays in IDLE until the connectivity state is set by the LB policy. - When exiting IDLE mode (because of `Connect` being called or because of an RPC), if name resolver creation fails, we stay in IDLE. #### New Behavior - When the client exits IDLE and creates the name resolver, it moves to CONNECTING. Moving forward, the connectivity state will be set by the LB policy. - When exiting IDLE mode (because of `Connect` being called or because of an RPC), we have already moved to CONNECTING (because of the previous bullet point). If name resolver creation fails, we will move to TRANSIENT_FAILURE and start the idle timer and move back to IDLE when the timer fires #### Implementation details: - The client channel now treats resolver build errors encountered during exiting IDLE identically to resolver errors received prior to valid updates. - `Build` uses a new unsafe API on the idleness manager to mark the channel as exited IDLE. - The idleness Manager invokes the channel's `ExitIdleMode` (which now does not return an error) and updates internal state to reflect that it is no longer in IDLE. - `OnFinish` call options are now invoked even if stream creation fails during an RPC. This fulfills the guarantee for these options and ensures the idleness Manager’s `activeCallsCount` remains accurate. RELEASE NOTES: - client: Change connectivity state to CONNECTING when creating the name resolver (as part of exiting IDLE). - client: Change connectivity state to TRANSIENT_FAILURE if name resolver creation fails (as part of exiting IDLE). - client: Change connectivity state to IDLE after idle timeout expires even when current state is TRANSIENT_FAILURE. - client: Fix a bug that resulted in `OnFinish` call option not being invoked for RPCs where stream creation failed.
1 parent f9d2bdb commit e413838

File tree

10 files changed

+383
-174
lines changed

10 files changed

+383
-174
lines changed

clientconn.go

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -262,9 +262,10 @@ func DialContext(ctx context.Context, target string, opts ...DialOption) (conn *
262262
}()
263263

264264
// This creates the name resolver, load balancer, etc.
265-
if err := cc.idlenessMgr.ExitIdleMode(); err != nil {
266-
return nil, err
265+
if err := cc.exitIdleMode(); err != nil {
266+
return nil, fmt.Errorf("failed to exit idle mode: %w", err)
267267
}
268+
cc.idlenessMgr.UnsafeSetNotIdle()
268269

269270
// Return now for non-blocking dials.
270271
if !cc.dopts.block {
@@ -332,7 +333,7 @@ func (cc *ClientConn) addTraceEvent(msg string) {
332333
Severity: channelz.CtInfo,
333334
}
334335
}
335-
channelz.AddTraceEvent(logger, cc.channelz, 0, ted)
336+
channelz.AddTraceEvent(logger, cc.channelz, 1, ted)
336337
}
337338

338339
type idler ClientConn
@@ -341,26 +342,41 @@ func (i *idler) EnterIdleMode() {
341342
(*ClientConn)(i).enterIdleMode()
342343
}
343344

344-
func (i *idler) ExitIdleMode() error {
345-
return (*ClientConn)(i).exitIdleMode()
345+
func (i *idler) ExitIdleMode() {
346+
// Ignore the error returned from this method, because from the perspective
347+
// of the caller (idleness manager), the channel would have always moved out
348+
// of IDLE by the time this method returns.
349+
(*ClientConn)(i).exitIdleMode()
346350
}
347351

348352
// exitIdleMode moves the channel out of idle mode by recreating the name
349353
// resolver and load balancer. This should never be called directly; use
350354
// cc.idlenessMgr.ExitIdleMode instead.
351-
func (cc *ClientConn) exitIdleMode() (err error) {
355+
func (cc *ClientConn) exitIdleMode() error {
352356
cc.mu.Lock()
353357
if cc.conns == nil {
354358
cc.mu.Unlock()
355359
return errConnClosing
356360
}
357361
cc.mu.Unlock()
358362

363+
// Set state to CONNECTING before building the name resolver
364+
// so the channel does not remain in IDLE.
365+
cc.csMgr.updateState(connectivity.Connecting)
366+
359367
// This needs to be called without cc.mu because this builds a new resolver
360368
// which might update state or report error inline, which would then need to
361369
// acquire cc.mu.
362370
if err := cc.resolverWrapper.start(); err != nil {
363-
return err
371+
// If resolver creation fails, treat it like an error reported by the
372+
// resolver before any valid udpates. Set channel's state to
373+
// TransientFailure, and set an erroring picker with the resolver build
374+
// error, which will returned as part of any subsequent RPCs.
375+
logger.Warningf("Failed to start resolver: %v", err)
376+
cc.csMgr.updateState(connectivity.TransientFailure)
377+
cc.mu.Lock()
378+
cc.updateResolverStateAndUnlock(resolver.State{}, err)
379+
return fmt.Errorf("failed to start resolver: %w", err)
364380
}
365381

366382
cc.addTraceEvent("exiting idle mode")
@@ -681,10 +697,8 @@ func (cc *ClientConn) GetState() connectivity.State {
681697
// Notice: This API is EXPERIMENTAL and may be changed or removed in a later
682698
// release.
683699
func (cc *ClientConn) Connect() {
684-
if err := cc.idlenessMgr.ExitIdleMode(); err != nil {
685-
cc.addTraceEvent(err.Error())
686-
return
687-
}
700+
cc.idlenessMgr.ExitIdleMode()
701+
688702
// If the ClientConn was not in idle mode, we need to call ExitIdle on the
689703
// LB policy so that connections can be created.
690704
cc.mu.Lock()
@@ -735,8 +749,8 @@ func init() {
735749
internal.EnterIdleModeForTesting = func(cc *ClientConn) {
736750
cc.idlenessMgr.EnterIdleModeForTesting()
737751
}
738-
internal.ExitIdleModeForTesting = func(cc *ClientConn) error {
739-
return cc.idlenessMgr.ExitIdleMode()
752+
internal.ExitIdleModeForTesting = func(cc *ClientConn) {
753+
cc.idlenessMgr.ExitIdleMode()
740754
}
741755
}
742756

clientconn_parsed_target_test.go

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -205,28 +205,31 @@ func (s) TestParsedTarget_Failure_WithoutCustomDialer(t *testing.T) {
205205
}
206206

207207
func (s) TestParsedTarget_Failure_WithoutCustomDialer_WithNewClient(t *testing.T) {
208-
targets := []string{
209-
"",
210-
"unix://a/b/c",
211-
"unix://authority",
212-
"unix-abstract://authority/a/b/c",
213-
"unix-abstract://authority",
208+
tests := []struct {
209+
target string
210+
wantErrSubstr string
211+
}{
212+
213+
{target: "", wantErrSubstr: "invalid target address"},
214+
{target: "unix://a/b/c", wantErrSubstr: "invalid (non-empty) authority"},
215+
{target: "unix://authority", wantErrSubstr: "invalid (non-empty) authority"},
216+
{target: "unix-abstract://authority/a/b/c", wantErrSubstr: "invalid (non-empty) authority"},
217+
{target: "unix-abstract://authority", wantErrSubstr: "invalid (non-empty) authority"},
214218
}
215219

216-
for _, target := range targets {
217-
t.Run(target, func(t *testing.T) {
220+
for _, test := range tests {
221+
t.Run(test.target, func(t *testing.T) {
218222
ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
219223
defer cancel()
220-
cc, err := NewClient(target, WithTransportCredentials(insecure.NewCredentials()))
224+
cc, err := NewClient(test.target, WithTransportCredentials(insecure.NewCredentials()))
221225
if err != nil {
222-
t.Fatalf("NewClient(%q) failed: %v", target, err)
226+
t.Fatalf("NewClient(%q) failed: %v", test, err)
223227
}
224228
defer cc.Close()
225-
const wantErrSubstr = "failed to exit idle mode"
226229
if _, err := cc.NewStream(ctx, &StreamDesc{}, "/my.service.v1.MyService/UnaryCall"); err == nil {
227-
t.Fatalf("NewStream() succeeded with target = %q, cc.parsedTarget = %+v, expected to fail", target, cc.parsedTarget)
228-
} else if !strings.Contains(err.Error(), wantErrSubstr) {
229-
t.Fatalf("NewStream() with target = %q returned unexpected error: got %v, want substring %q", target, err, wantErrSubstr)
230+
t.Fatalf("NewStream() succeeded with target = %q, cc.parsedTarget = %+v, expected to fail", test, cc.parsedTarget)
231+
} else if !strings.Contains(err.Error(), test.wantErrSubstr) {
232+
t.Fatalf("NewStream() with target = %q returned unexpected error: got %v, want substring %q", test, err, test.wantErrSubstr)
230233
}
231234
})
232235
}

dial_test.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package grpc
2020

2121
import (
2222
"context"
23+
"fmt"
2324
"net"
2425
"strings"
2526
"testing"
@@ -312,3 +313,36 @@ func (s) TestResolverAddressesWithTypedNilAttribute(t *testing.T) {
312313
type stringerVal struct{ s string }
313314

314315
func (s stringerVal) String() string { return s.s }
316+
317+
const errResolverBuilderScheme = "test-resolver-build-failure"
318+
319+
// errResolverBuilder is a resolver builder that returns an error from its Build
320+
// method.
321+
type errResolverBuilder struct {
322+
err error
323+
}
324+
325+
func (b *errResolverBuilder) Build(resolver.Target, resolver.ClientConn, resolver.BuildOptions) (resolver.Resolver, error) {
326+
return nil, b.err
327+
}
328+
329+
func (b *errResolverBuilder) Scheme() string {
330+
return errResolverBuilderScheme
331+
}
332+
333+
// Tests that Dial returns an error if the resolver builder returns an error
334+
// from its Build method.
335+
func (s) TestDial_ResolverBuilder_Error(t *testing.T) {
336+
resolverErr := fmt.Errorf("resolver builder error")
337+
dopts := []DialOption{
338+
WithTransportCredentials(insecure.NewCredentials()),
339+
WithResolvers(&errResolverBuilder{err: resolverErr}),
340+
}
341+
_, err := Dial(errResolverBuilderScheme+":///test.server", dopts...)
342+
if err == nil {
343+
t.Fatalf("Dial() succeeded when it should have failed")
344+
}
345+
if !strings.Contains(err.Error(), resolverErr.Error()) {
346+
t.Fatalf("Dial() failed with error %v, want %v", err, resolverErr)
347+
}
348+
}

internal/idle/idle.go

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
package idle
2222

2323
import (
24-
"fmt"
2524
"math"
2625
"sync"
2726
"sync/atomic"
@@ -33,15 +32,15 @@ var timeAfterFunc = func(d time.Duration, f func()) *time.Timer {
3332
return time.AfterFunc(d, f)
3433
}
3534

36-
// Enforcer is the functionality provided by grpc.ClientConn to enter
37-
// and exit from idle mode.
38-
type Enforcer interface {
39-
ExitIdleMode() error
35+
// ClientConn is the functionality provided by grpc.ClientConn to enter and exit
36+
// from idle mode.
37+
type ClientConn interface {
38+
ExitIdleMode()
4039
EnterIdleMode()
4140
}
4241

43-
// Manager implements idleness detection and calls the configured Enforcer to
44-
// enter/exit idle mode when appropriate. Must be created by NewManager.
42+
// Manager implements idleness detection and calls the ClientConn to enter/exit
43+
// idle mode when appropriate. Must be created by NewManager.
4544
type Manager struct {
4645
// State accessed atomically.
4746
lastCallEndTime int64 // Unix timestamp in nanos; time when the most recent RPC completed.
@@ -51,8 +50,8 @@ type Manager struct {
5150

5251
// Can be accessed without atomics or mutex since these are set at creation
5352
// time and read-only after that.
54-
enforcer Enforcer // Functionality provided by grpc.ClientConn.
55-
timeout time.Duration
53+
cc ClientConn // Functionality provided by grpc.ClientConn.
54+
timeout time.Duration
5655

5756
// idleMu is used to guarantee mutual exclusion in two scenarios:
5857
// - Opposing intentions:
@@ -72,9 +71,9 @@ type Manager struct {
7271

7372
// NewManager creates a new idleness manager implementation for the
7473
// given idle timeout. It begins in idle mode.
75-
func NewManager(enforcer Enforcer, timeout time.Duration) *Manager {
74+
func NewManager(cc ClientConn, timeout time.Duration) *Manager {
7675
return &Manager{
77-
enforcer: enforcer,
76+
cc: cc,
7877
timeout: timeout,
7978
actuallyIdle: true,
8079
activeCallsCount: -math.MaxInt32,
@@ -127,7 +126,7 @@ func (m *Manager) handleIdleTimeout() {
127126

128127
// Now that we've checked that there has been no activity, attempt to enter
129128
// idle mode, which is very likely to succeed.
130-
if m.tryEnterIdleMode() {
129+
if m.tryEnterIdleMode(true) {
131130
// Successfully entered idle mode. No timer needed until we exit idle.
132131
return
133132
}
@@ -142,10 +141,13 @@ func (m *Manager) handleIdleTimeout() {
142141
// that, it performs a last minute check to ensure that no new RPC has come in,
143142
// making the channel active.
144143
//
144+
// checkActivity controls if a check for RPC activity, since the last time the
145+
// idle_timeout fired, is made.
146+
145147
// Return value indicates whether or not the channel moved to idle mode.
146148
//
147149
// Holds idleMu which ensures mutual exclusion with exitIdleMode.
148-
func (m *Manager) tryEnterIdleMode() bool {
150+
func (m *Manager) tryEnterIdleMode(checkActivity bool) bool {
149151
// Setting the activeCallsCount to -math.MaxInt32 indicates to OnCallBegin()
150152
// that the channel is either in idle mode or is trying to get there.
151153
if !atomic.CompareAndSwapInt32(&m.activeCallsCount, 0, -math.MaxInt32) {
@@ -166,7 +168,7 @@ func (m *Manager) tryEnterIdleMode() bool {
166168
atomic.AddInt32(&m.activeCallsCount, math.MaxInt32)
167169
return false
168170
}
169-
if atomic.LoadInt32(&m.activeSinceLastTimerCheck) == 1 {
171+
if checkActivity && atomic.LoadInt32(&m.activeSinceLastTimerCheck) == 1 {
170172
// A very short RPC could have come in (and also finished) after we
171173
// checked for calls count and activity in handleIdleTimeout(), but
172174
// before the CAS operation. So, we need to check for activity again.
@@ -177,44 +179,37 @@ func (m *Manager) tryEnterIdleMode() bool {
177179
// No new RPCs have come in since we set the active calls count value to
178180
// -math.MaxInt32. And since we have the lock, it is safe to enter idle mode
179181
// unconditionally now.
180-
m.enforcer.EnterIdleMode()
182+
m.cc.EnterIdleMode()
181183
m.actuallyIdle = true
182184
return true
183185
}
184186

185187
// EnterIdleModeForTesting instructs the channel to enter idle mode.
186188
func (m *Manager) EnterIdleModeForTesting() {
187-
m.tryEnterIdleMode()
189+
m.tryEnterIdleMode(false)
188190
}
189191

190192
// OnCallBegin is invoked at the start of every RPC.
191-
func (m *Manager) OnCallBegin() error {
193+
func (m *Manager) OnCallBegin() {
192194
if m.isClosed() {
193-
return nil
195+
return
194196
}
195197

196198
if atomic.AddInt32(&m.activeCallsCount, 1) > 0 {
197199
// Channel is not idle now. Set the activity bit and allow the call.
198200
atomic.StoreInt32(&m.activeSinceLastTimerCheck, 1)
199-
return nil
201+
return
200202
}
201203

202204
// Channel is either in idle mode or is in the process of moving to idle
203205
// mode. Attempt to exit idle mode to allow this RPC.
204-
if err := m.ExitIdleMode(); err != nil {
205-
// Undo the increment to calls count, and return an error causing the
206-
// RPC to fail.
207-
atomic.AddInt32(&m.activeCallsCount, -1)
208-
return err
209-
}
210-
206+
m.ExitIdleMode()
211207
atomic.StoreInt32(&m.activeSinceLastTimerCheck, 1)
212-
return nil
213208
}
214209

215-
// ExitIdleMode instructs m to call the enforcer's ExitIdleMode and update m's
210+
// ExitIdleMode instructs m to call the ClientConn's ExitIdleMode and update its
216211
// internal state.
217-
func (m *Manager) ExitIdleMode() error {
212+
func (m *Manager) ExitIdleMode() {
218213
// Holds idleMu which ensures mutual exclusion with tryEnterIdleMode.
219214
m.idleMu.Lock()
220215
defer m.idleMu.Unlock()
@@ -231,20 +226,34 @@ func (m *Manager) ExitIdleMode() error {
231226
// m.ExitIdleMode.
232227
//
233228
// In any case, there is nothing to do here.
234-
return nil
229+
return
235230
}
236231

237-
if err := m.enforcer.ExitIdleMode(); err != nil {
238-
return fmt.Errorf("failed to exit idle mode: %w", err)
239-
}
232+
m.cc.ExitIdleMode()
240233

241234
// Undo the idle entry process. This also respects any new RPC attempts.
242235
atomic.AddInt32(&m.activeCallsCount, math.MaxInt32)
243236
m.actuallyIdle = false
244237

245238
// Start a new timer to fire after the configured idle timeout.
246239
m.resetIdleTimerLocked(m.timeout)
247-
return nil
240+
}
241+
242+
// UnsafeSetNotIdle instructs the Manager to update its internal state to
243+
// reflect the reality that the channel is no longer in IDLE mode.
244+
//
245+
// N.B. This method is intended only for internal use by the gRPC client
246+
// when it exits IDLE mode **manually** from `Dial`. The callsite must ensure:
247+
// - The channel was **actually in IDLE mode** immediately prior to the call.
248+
// - There is **no concurrent activity** that could cause the channel to exit
249+
// IDLE mode *naturally* at the same time.
250+
func (m *Manager) UnsafeSetNotIdle() {
251+
m.idleMu.Lock()
252+
defer m.idleMu.Unlock()
253+
254+
atomic.AddInt32(&m.activeCallsCount, math.MaxInt32)
255+
m.actuallyIdle = false
256+
m.resetIdleTimerLocked(m.timeout)
248257
}
249258

250259
// OnCallEnd is invoked at the end of every RPC.

0 commit comments

Comments
 (0)