@@ -48,6 +48,8 @@ const (
 	defaultRenewDeadline = 10 * time.Second
 	defaultRetryPeriod = 2 * time.Second

+	defaultRunnableTearDownTimeout = 10 * time.Second
+
 	defaultReadinessEndpoint = "/readyz"
 	defaultLivenessEndpoint = "/healthz"
 )
@@ -114,11 +116,7 @@ type controllerManager struct {
 	started bool
 	startedLeader bool
 	healthzStarted bool
-
-	// NB(directxman12): we don't just use an error channel here to avoid the situation where the
-	// error channel is too small and we end up blocking some goroutines waiting to report their errors.
-	// errSignal lets us track when we should stop because an error occurred
-	errSignal *errSignaler
+	errChan chan error

 	// internalStop is the stop channel *actually* used by everything involved
 	// with the manager as a stop channel, so that we can pass a stop channel
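Reviewer aside (not part of the patch): the deleted NB comment explains why the old code avoided a plain error channel — a send on a channel nobody reads blocks the sending goroutine. The patch keeps cm.errChan unbuffered and instead drains it once the stop procedure is engaged (see engageStopProcedure further down). A minimal standalone sketch of that drain pattern, with made-up names, could look like this:

package main

import (
	"errors"
	"fmt"
	"sync"
	"time"
)

func main() {
	errChan := make(chan error)         // unbuffered, like cm.errChan
	stopComplete := make(chan struct{}) // closed once shutdown has finished
	var wg sync.WaitGroup

	// A runnable that fails while we are already shutting down; without a
	// drain goroutine this send would block forever.
	wg.Add(1)
	go func() {
		defer wg.Done()
		errChan <- errors.New("runnable failed during shutdown")
	}()

	// Drain errChan until shutdown completes so senders never block.
	go func() {
		for {
			select {
			case err := <-errChan:
				fmt.Println("error received after stop sequence was engaged:", err)
			case <-stopComplete:
				return
			}
		}
	}()

	wg.Wait() // the failing runnable has delivered its error
	close(stopComplete)
	time.Sleep(10 * time.Millisecond) // let the drain goroutine finish printing
}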
@@ -130,6 +128,9 @@ type controllerManager struct {
 	// It and `internalStop` should point to the same channel.
 	internalStopper chan<- struct{}

+	// stop procedure engaged. In other words, we should not add anything else to the manager
+	stopProcedureEngaged bool
+
 	startCache func(stop <-chan struct{}) error

 	// port is the port that the webhook server serves at.
@@ -152,57 +153,23 @@ type controllerManager struct {
 	// retryPeriod is the duration the LeaderElector clients should wait
 	// between tries of actions.
 	retryPeriod time.Duration
-}
-
-type errSignaler struct {
-	// errSignal indicates that an error occurred, when closed. It shouldn't
-	// be written to.
-	errSignal chan struct{}
-
-	// err is the received error
-	err error
-
-	mu sync.Mutex
-}
-
-func (r *errSignaler) SignalError(err error) {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-
-	if err == nil {
-		// non-error, ignore
-		log.Error(nil, "SignalError called without an (with a nil) error, which should never happen, ignoring")
-		return
-	}
-
-	if r.err != nil {
-		// we already have an error, don't try again
-		return
-	}
-
-	// save the error and report it
-	r.err = err
-	close(r.errSignal)
-}

-func (r *errSignaler) Error() error {
-	r.mu.Lock()
-	defer r.mu.Unlock()
+	// waitForRunnable is holding the number of runnables currently running so that
+	// we can wait for them to exit before quitting the manager
+	waitForRunnable sync.WaitGroup

-	return r.err
-}
-
-func (r *errSignaler) GotError() chan struct{} {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-
-	return r.errSignal
+	// runnableTearDownTimeout is the duration given to runnable to stop
+	// before the manager actually returns on stop.
+	runnableTearDownTimeout time.Duration
 }

 // Add sets dependencies on i, and adds it to the list of Runnables to start.
 func (cm *controllerManager) Add(r Runnable) error {
 	cm.mu.Lock()
 	defer cm.mu.Unlock()
+	if cm.stopProcedureEngaged {
+		return fmt.Errorf("can't accept new runnable as stop procedure is already engaged")
+	}

 	// Set dependencies on the object
 	if err := cm.SetFields(r); err != nil {
@@ -222,11 +189,7 @@ func (cm *controllerManager) Add(r Runnable) error {

 	if shouldStart {
 		// If already started, start the controller
-		go func() {
-			if err := r.Start(cm.internalStop); err != nil {
-				cm.errSignal.SignalError(err)
-			}
-		}()
+		cm.startRunnable(r)
 	}

 	return nil
@@ -264,6 +227,9 @@ func (cm *controllerManager) SetFields(i interface{}) error {
 func (cm *controllerManager) AddHealthzCheck(name string, check healthz.Checker) error {
 	cm.mu.Lock()
 	defer cm.mu.Unlock()
+	if cm.stopProcedureEngaged {
+		return fmt.Errorf("can't accept new healthCheck as stop procedure is already engaged")
+	}

 	if cm.healthzStarted {
 		return fmt.Errorf("unable to add new checker because healthz endpoint has already been created")
@@ -281,6 +247,9 @@ func (cm *controllerManager) AddHealthzCheck(name string, check healthz.Checker) error {
 func (cm *controllerManager) AddReadyzCheck(name string, check healthz.Checker) error {
 	cm.mu.Lock()
 	defer cm.mu.Unlock()
+	if cm.stopProcedureEngaged {
+		return fmt.Errorf("can't accept new ready check as stop procedure is already engaged")
+	}

 	if cm.healthzStarted {
 		return fmt.Errorf("unable to add new checker because readyz endpoint has already been created")
@@ -355,15 +324,15 @@ func (cm *controllerManager) serveMetrics(stop <-chan struct{}) {
 	go func() {
 		log.Info("starting metrics server", "path", metricsPath)
 		if err := server.Serve(cm.metricsListener); err != nil && err != http.ErrServerClosed {
-			cm.errSignal.SignalError(err)
+			cm.errChan <- err
 		}
 	}()

 	// Shutdown the server when stop is closed
 	select {
 	case <-stop:
 		if err := server.Shutdown(context.Background()); err != nil {
-			cm.errSignal.SignalError(err)
+			cm.errChan <- err
 		}
 	}
 }
@@ -385,7 +354,7 @@ func (cm *controllerManager) serveHealthProbes(stop <-chan struct{}) {
 	// Run server
 	go func() {
 		if err := server.Serve(cm.healthProbeListener); err != nil && err != http.ErrServerClosed {
-			cm.errSignal.SignalError(err)
+			cm.errChan <- err
 		}
 	}()
 	cm.healthzStarted = true
@@ -395,17 +364,18 @@ func (cm *controllerManager) serveHealthProbes(stop <-chan struct{}) {
 	select {
 	case <-stop:
 		if err := server.Shutdown(context.Background()); err != nil {
-			cm.errSignal.SignalError(err)
+			cm.errChan <- err
 		}
 	}
 }

 func (cm *controllerManager) Start(stop <-chan struct{}) error {
-	// join the passed-in stop channel as an upstream feeding into cm.internalStopper
-	defer close(cm.internalStopper)
+	// This chan indicates that stop is complete, in other words all runnables have returned or timeout on stop request
+	stopComplete := make(chan struct{})
+	defer close(stopComplete)

 	// initialize this here so that we reset the signal channel state on every start
-	cm.errSignal = &errSignaler{errSignal: make(chan struct{})}
+	cm.errChan = make(chan error)

 	// Metrics should be served whether the controller is leader or not.
 	// (If we don't serve metrics for non-leaders, prometheus will still scrape
@@ -424,6 +394,9 @@ func (cm *controllerManager) Start(stop <-chan struct{}) error {
 	if cm.resourceLock != nil {
 		err := cm.startLeaderElection()
 		if err != nil {
+			if errStop := cm.engageStopProcedure(stopComplete); errStop != nil {
+				log.Error(errStop, "some runnables could not be stopped after error occurred in startLeaderElection.")
+			}
 			return err
 		}
 	} else {
@@ -433,10 +406,50 @@ func (cm *controllerManager) Start(stop <-chan struct{}) error {
 	select {
 	case <-stop:
 		// We are done
+		return cm.engageStopProcedure(stopComplete)
+	case err := <-cm.errChan:
+		// Error starting or running a runnable
+		if errStop := cm.engageStopProcedure(stopComplete); errStop != nil {
+			log.Error(errStop, "some runnables could not be stopped after error occurred starting/running the manager.")
+		}
+		return err
+	}
+}
+func (cm *controllerManager) engageStopProcedure(stopComplete chan struct{}) error {
+	cm.mu.Lock()
+	defer cm.mu.Unlock()
+	cm.stopProcedureEngaged = true
+	close(cm.internalStopper)
+	go func() {
+		for {
+			select {
+			case err, ok := <-cm.errChan:
+				if ok {
+					log.Error(err, "error received after stop sequence was engaged")
+				}
+			case <-stopComplete:
+				return
+			}
+		}
+	}()
+	return cm.waitForRunnableToEnd()
+}
+
+func (cm *controllerManager) waitForRunnableToEnd() error {
+	runnableTearDownTimer := time.NewTimer(cm.runnableTearDownTimeout)
+	defer runnableTearDownTimer.Stop()
+	allStopped := make(chan struct{})
+
+	go func() {
+		cm.waitForRunnable.Wait()
+		close(allStopped)
+	}()
+
+	select {
+	case <-allStopped:
 		return nil
-	case <-cm.errSignal.GotError():
-		// Error starting a controller
-		return cm.errSignal.Error()
+	case <-runnableTearDownTimer.C:
+		return fmt.Errorf("not all runnables have stopped within the proposed delay of %s", cm.runnableTearDownTimeout.String())
 	}
 }

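Reviewer aside (not part of the patch): waitForRunnableToEnd bounds the WaitGroup wait with runnableTearDownTimeout by racing a helper goroutine that closes a channel against a timer. The same shape, pulled out into a hypothetical standalone helper for illustration:

package main

import (
	"fmt"
	"sync"
	"time"
)

// waitWithTimeout mirrors the WaitGroup + timer + select shape used by
// waitForRunnableToEnd: wait for wg, but give up after timeout.
func waitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error {
	timer := time.NewTimer(timeout)
	defer timer.Stop()

	allStopped := make(chan struct{})
	go func() {
		wg.Wait()
		close(allStopped)
	}()

	select {
	case <-allStopped:
		return nil
	case <-timer.C:
		return fmt.Errorf("not all runnables have stopped within %s", timeout)
	}
}

func main() {
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		time.Sleep(50 * time.Millisecond) // a runnable that takes a moment to stop
	}()

	fmt.Println(waitWithTimeout(&wg, 100*time.Millisecond)) // prints <nil>
}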
@@ -453,7 +466,7 @@ func (cm *controllerManager) startNonLeaderElectionRunnables() {
 		ctrl := c
 		go func() {
 			if err := ctrl.Start(cm.internalStop); err != nil {
-				cm.errSignal.SignalError(err)
+				cm.errChan <- err
 			}
 			// we use %T here because we don't have a good stand-in for "name",
 			// and the full runnable might not serialize (mutexes, etc)
@@ -473,14 +486,7 @@ func (cm *controllerManager) startLeaderElectionRunnables() {
 		// Controllers block, but we want to return an error if any have an error starting.
 		// Write any Start errors to a channel so we can return them
 		ctrl := c
-		go func() {
-			if err := ctrl.Start(cm.internalStop); err != nil {
-				cm.errSignal.SignalError(err)
-			}
-			// we use %T here because we don't have a good stand-in for "name",
-			// and the full runnable might not serialize (mutexes, etc)
-			log.V(1).Info("leader-election runnable finished", "runnable type", fmt.Sprintf("%T", ctrl))
-		}()
+		cm.startRunnable(ctrl)
 	}

 	cm.startedLeader = true
@@ -497,7 +503,7 @@ func (cm *controllerManager) waitForCache() {
 	}
 	go func() {
 		if err := cm.startCache(cm.internalStop); err != nil {
-			cm.errSignal.SignalError(err)
+			cm.errChan <- err
 		}
 	}()

@@ -521,7 +527,7 @@ func (cm *controllerManager) startLeaderElection() (err error) {
 				// Most implementations of leader election log.Fatal() here.
 				// Since Start is wrapped in log.Fatal when called, we can just return
 				// an error here which will cause the program to exit.
-				cm.errSignal.SignalError(fmt.Errorf("leader election lost"))
+				cm.errChan <- fmt.Errorf("leader election lost")
 			},
 		},
 	})
@@ -542,3 +548,13 @@ func (cm *controllerManager) startLeaderElection() (err error) {
 	go l.Run(ctx)
 	return nil
 }
+
+func (cm *controllerManager) startRunnable(r Runnable) {
+	cm.waitForRunnable.Add(1)
+	go func() {
+		defer cm.waitForRunnable.Done()
+		if err := r.Start(cm.internalStop); err != nil {
+			cm.errChan <- err
+		}
+	}()
+}
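Reviewer aside (not part of the patch): startRunnable is what makes the teardown wait meaningful — every runnable is registered with waitForRunnable before its goroutine starts, and any error it returns is forwarded to errChan. A rough, self-contained approximation of the start/stop flow, using a simplified runnable type and a hypothetical miniManager struct:

package main

import (
	"fmt"
	"sync"
)

type runnable func(stop <-chan struct{}) error

type miniManager struct {
	errChan         chan error
	internalStop    chan struct{}
	waitForRunnable sync.WaitGroup
}

// startRunnable mirrors the diff: register the runnable with the WaitGroup
// before launching it, and forward any error it returns to errChan.
func (m *miniManager) startRunnable(r runnable) {
	m.waitForRunnable.Add(1)
	go func() {
		defer m.waitForRunnable.Done()
		if err := r(m.internalStop); err != nil {
			m.errChan <- err
		}
	}()
}

func main() {
	m := &miniManager{errChan: make(chan error), internalStop: make(chan struct{})}

	m.startRunnable(func(stop <-chan struct{}) error {
		<-stop // block until asked to stop, like a controller would
		return nil
	})

	close(m.internalStop)    // engage the stop procedure
	m.waitForRunnable.Wait() // all runnables have returned
	fmt.Println("all runnables stopped")
}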