@@ -48,6 +48,8 @@ const (
48
48
defaultRenewDeadline = 10 * time .Second
49
49
defaultRetryPeriod = 2 * time .Second
50
50
51
+ defaultRunnableTearDownTimeout = 10 * time .Second
52
+
51
53
defaultReadinessEndpoint = "/readyz"
52
54
defaultLivenessEndpoint = "/healthz"
53
55
)
@@ -126,6 +128,9 @@ type controllerManager struct {
126
128
// It and `internalStop` should point to the same channel.
127
129
internalStopper chan <- struct {}
128
130
131
+ // stop procedure engaged. In other words, we should not add anything else to the manager
132
+ stopProcedureEngaged bool
133
+
129
134
startCache func (stop <- chan struct {}) error
130
135
131
136
// port is the port that the webhook server serves at.
@@ -148,12 +153,23 @@ type controllerManager struct {
148
153
// retryPeriod is the duration the LeaderElector clients should wait
149
154
// between tries of actions.
150
155
retryPeriod time.Duration
156
+
157
+ // waitForRunnable is holding the number of runnables currently running so that
158
+ // we can wait for them to exit before quitting the manager
159
+ waitForRunnable sync.WaitGroup
160
+
161
+ // runnableTearDownTimeout is the duration given to runnable to stop
162
+ // before the manager actually returns on stop.
163
+ runnableTearDownTimeout time.Duration
151
164
}
152
165
153
166
// Add sets dependencies on i, and adds it to the list of Runnables to start.
154
167
func (cm * controllerManager ) Add (r Runnable ) error {
155
168
cm .mu .Lock ()
156
169
defer cm .mu .Unlock ()
170
+ if cm .stopProcedureEngaged {
171
+ return fmt .Errorf ("can't accept new runnable as stop procedure is already engaged" )
172
+ }
157
173
158
174
// Set dependencies on the object
159
175
if err := cm .SetFields (r ); err != nil {
@@ -173,9 +189,7 @@ func (cm *controllerManager) Add(r Runnable) error {
173
189
174
190
if shouldStart {
175
191
// If already started, start the controller
176
- go func () {
177
- cm .errChan <- r .Start (cm .internalStop )
178
- }()
192
+ cm .startRunnable (r )
179
193
}
180
194
181
195
return nil
@@ -213,6 +227,9 @@ func (cm *controllerManager) SetFields(i interface{}) error {
213
227
func (cm * controllerManager ) AddHealthzCheck (name string , check healthz.Checker ) error {
214
228
cm .mu .Lock ()
215
229
defer cm .mu .Unlock ()
230
+ if cm .stopProcedureEngaged {
231
+ return fmt .Errorf ("can't accept new healthCheck as stop procedure is already engaged" )
232
+ }
216
233
217
234
if cm .healthzStarted {
218
235
return fmt .Errorf ("unable to add new checker because healthz endpoint has already been created" )
@@ -230,6 +247,9 @@ func (cm *controllerManager) AddHealthzCheck(name string, check healthz.Checker)
230
247
func (cm * controllerManager ) AddReadyzCheck (name string , check healthz.Checker ) error {
231
248
cm .mu .Lock ()
232
249
defer cm .mu .Unlock ()
250
+ if cm .stopProcedureEngaged {
251
+ return fmt .Errorf ("can't accept new ready check as stop procedure is already engaged" )
252
+ }
233
253
234
254
if cm .healthzStarted {
235
255
return fmt .Errorf ("unable to add new checker because readyz endpoint has already been created" )
@@ -350,8 +370,9 @@ func (cm *controllerManager) serveHealthProbes(stop <-chan struct{}) {
350
370
}
351
371
352
372
func (cm * controllerManager ) Start (stop <- chan struct {}) error {
353
- // join the passed-in stop channel as an upstream feeding into cm.internalStopper
354
- defer close (cm .internalStopper )
373
+ // This chan indicates that stop is complete, in other words all runnables have returned or timeout on stop request
374
+ stopComplete := make (chan struct {})
375
+ defer close (stopComplete )
355
376
356
377
// Metrics should be served whether the controller is leader or not.
357
378
// (If we don't serve metrics for non-leaders, prometheus will still scrape
@@ -370,6 +391,9 @@ func (cm *controllerManager) Start(stop <-chan struct{}) error {
370
391
if cm .resourceLock != nil {
371
392
err := cm .startLeaderElection ()
372
393
if err != nil {
394
+ if errStop := cm .engageStopProcedure (stopComplete ); errStop != nil {
395
+ log .Error (errStop , "some runnables could not be stopped after error occurred in startLeaderElection." )
396
+ }
373
397
return err
374
398
}
375
399
} else {
@@ -379,12 +403,52 @@ func (cm *controllerManager) Start(stop <-chan struct{}) error {
379
403
select {
380
404
case <- stop :
381
405
// We are done
382
- return nil
406
+ return cm . engageStopProcedure ( stopComplete )
383
407
case err := <- cm .errChan :
384
- // Error starting a controller
408
+ // Error starting or running a runnable
409
+ if errStop := cm .engageStopProcedure (stopComplete ); errStop != nil {
410
+ log .Error (errStop , "some runnables could not be stopped after error occurred starting/running the manager." )
411
+ }
385
412
return err
386
413
}
387
414
}
415
+ func (cm * controllerManager ) engageStopProcedure (stopComplete chan struct {}) error {
416
+ cm .mu .Lock ()
417
+ defer cm .mu .Unlock ()
418
+ cm .stopProcedureEngaged = true
419
+ close (cm .internalStopper )
420
+ go func () {
421
+ for {
422
+ select {
423
+ case err , ok := <- cm .errChan :
424
+ if ok {
425
+ log .Error (err , "error received after stop sequence was engaged" )
426
+ }
427
+ case <- stopComplete :
428
+ return
429
+ }
430
+ }
431
+ }()
432
+ return cm .waitForRunnableToEnd ()
433
+ }
434
+
435
+ func (cm * controllerManager ) waitForRunnableToEnd () error {
436
+ runnableTearDownTimer := time .NewTimer (cm .runnableTearDownTimeout )
437
+ defer runnableTearDownTimer .Stop ()
438
+ allStopped := make (chan struct {})
439
+
440
+ go func () {
441
+ cm .waitForRunnable .Wait ()
442
+ close (allStopped )
443
+ }()
444
+
445
+ select {
446
+ case <- allStopped :
447
+ return nil
448
+ case <- runnableTearDownTimer .C :
449
+ return fmt .Errorf ("not all runnables have stopped within the proposed delay of %s" , cm .runnableTearDownTimeout .String ())
450
+ }
451
+ }
388
452
389
453
func (cm * controllerManager ) startNonLeaderElectionRunnables () {
390
454
cm .mu .Lock ()
@@ -414,9 +478,7 @@ func (cm *controllerManager) startLeaderElectionRunnables() {
414
478
// Controllers block, but we want to return an error if any have an error starting.
415
479
// Write any Start errors to a channel so we can return them
416
480
ctrl := c
417
- go func () {
418
- cm .errChan <- ctrl .Start (cm .internalStop )
419
- }()
481
+ cm .startRunnable (ctrl )
420
482
}
421
483
422
484
cm .startedLeader = true
@@ -478,3 +540,13 @@ func (cm *controllerManager) startLeaderElection() (err error) {
478
540
go l .Run (ctx )
479
541
return nil
480
542
}
543
+
544
+ func (cm * controllerManager ) startRunnable (r Runnable ) {
545
+ cm .waitForRunnable .Add (1 )
546
+ go func () {
547
+ defer cm .waitForRunnable .Done ()
548
+ if err := r .Start (cm .internalStop ); err != nil {
549
+ cm .errChan <- err
550
+ }
551
+ }()
552
+ }
0 commit comments