Skip to content

Commit c013ca0

Browse files
committed
Wait for Runnables to end before returning in manager Start
1 parent ad57a97 commit c013ca0

File tree

3 files changed

+230
-52
lines changed

3 files changed

+230
-52
lines changed

pkg/manager/internal.go

Lines changed: 82 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ const (
4848
defaultRenewDeadline = 10 * time.Second
4949
defaultRetryPeriod = 2 * time.Second
5050

51+
defaultRunnableTearDownTimeout = 10 * time.Second
52+
5153
defaultReadinessEndpoint = "/readyz"
5254
defaultLivenessEndpoint = "/healthz"
5355
)
@@ -126,6 +128,9 @@ type controllerManager struct {
126128
// It and `internalStop` should point to the same channel.
127129
internalStopper chan<- struct{}
128130

131+
// stop procedure engaged. In other words, we should not add anything else to the manager
132+
stopProcedureEngaged bool
133+
129134
startCache func(stop <-chan struct{}) error
130135

131136
// port is the port that the webhook server serves at.
@@ -148,12 +153,23 @@ type controllerManager struct {
148153
// retryPeriod is the duration the LeaderElector clients should wait
149154
// between tries of actions.
150155
retryPeriod time.Duration
156+
157+
// waitForRunnable is holding the number of runnables currently running so that
158+
// we can wait for them to exit before quitting the manager
159+
waitForRunnable sync.WaitGroup
160+
161+
// runnableTearDownTimeout is the duration given to runnable to stop
162+
// before the manager actually returns on stop.
163+
runnableTearDownTimeout time.Duration
151164
}
152165

153166
// Add sets dependencies on i, and adds it to the list of Runnables to start.
154167
func (cm *controllerManager) Add(r Runnable) error {
155168
cm.mu.Lock()
156169
defer cm.mu.Unlock()
170+
if cm.stopProcedureEngaged {
171+
return fmt.Errorf("can't accept new runnable as stop procedure is already engaged")
172+
}
157173

158174
// Set dependencies on the object
159175
if err := cm.SetFields(r); err != nil {
@@ -173,9 +189,7 @@ func (cm *controllerManager) Add(r Runnable) error {
173189

174190
if shouldStart {
175191
// If already started, start the controller
176-
go func() {
177-
cm.errChan <- r.Start(cm.internalStop)
178-
}()
192+
cm.startRunnable(r)
179193
}
180194

181195
return nil
@@ -213,6 +227,9 @@ func (cm *controllerManager) SetFields(i interface{}) error {
213227
func (cm *controllerManager) AddHealthzCheck(name string, check healthz.Checker) error {
214228
cm.mu.Lock()
215229
defer cm.mu.Unlock()
230+
if cm.stopProcedureEngaged {
231+
return fmt.Errorf("can't accept new healthCheck as stop procedure is already engaged")
232+
}
216233

217234
if cm.healthzStarted {
218235
return fmt.Errorf("unable to add new checker because healthz endpoint has already been created")
@@ -230,6 +247,9 @@ func (cm *controllerManager) AddHealthzCheck(name string, check healthz.Checker)
230247
func (cm *controllerManager) AddReadyzCheck(name string, check healthz.Checker) error {
231248
cm.mu.Lock()
232249
defer cm.mu.Unlock()
250+
if cm.stopProcedureEngaged {
251+
return fmt.Errorf("can't accept new ready check as stop procedure is already engaged")
252+
}
233253

234254
if cm.healthzStarted {
235255
return fmt.Errorf("unable to add new checker because readyz endpoint has already been created")
@@ -350,8 +370,9 @@ func (cm *controllerManager) serveHealthProbes(stop <-chan struct{}) {
350370
}
351371

352372
func (cm *controllerManager) Start(stop <-chan struct{}) error {
353-
// join the passed-in stop channel as an upstream feeding into cm.internalStopper
354-
defer close(cm.internalStopper)
373+
// This chan indicates that stop is complete, in other words all runnables have returned or timeout on stop request
374+
stopComplete := make(chan struct{})
375+
defer close(stopComplete)
355376

356377
// Metrics should be served whether the controller is leader or not.
357378
// (If we don't serve metrics for non-leaders, prometheus will still scrape
@@ -370,6 +391,9 @@ func (cm *controllerManager) Start(stop <-chan struct{}) error {
370391
if cm.resourceLock != nil {
371392
err := cm.startLeaderElection()
372393
if err != nil {
394+
if errStop := cm.engageStopProcedure(stopComplete); errStop != nil {
395+
log.Error(errStop, "some runnables could not be stopped after error occurred in startLeaderElection.")
396+
}
373397
return err
374398
}
375399
} else {
@@ -379,12 +403,52 @@ func (cm *controllerManager) Start(stop <-chan struct{}) error {
379403
select {
380404
case <-stop:
381405
// We are done
382-
return nil
406+
return cm.engageStopProcedure(stopComplete)
383407
case err := <-cm.errChan:
384-
// Error starting a controller
408+
// Error starting or running a runnable
409+
if errStop := cm.engageStopProcedure(stopComplete); errStop != nil {
410+
log.Error(errStop, "some runnables could not be stopped after error occurred starting/running the manager.")
411+
}
385412
return err
386413
}
387414
}
415+
func (cm *controllerManager) engageStopProcedure(stopComplete chan struct{}) error {
416+
cm.mu.Lock()
417+
defer cm.mu.Unlock()
418+
cm.stopProcedureEngaged = true
419+
close(cm.internalStopper)
420+
go func() {
421+
for {
422+
select {
423+
case err, ok := <-cm.errChan:
424+
if ok {
425+
log.Error(err, "error received after stop sequence was engaged")
426+
}
427+
case <-stopComplete:
428+
return
429+
}
430+
}
431+
}()
432+
return cm.waitForRunnableToEnd()
433+
}
434+
435+
func (cm *controllerManager) waitForRunnableToEnd() error {
436+
runnableTearDownTimer := time.NewTimer(cm.runnableTearDownTimeout)
437+
defer runnableTearDownTimer.Stop()
438+
allStopped := make(chan struct{})
439+
440+
go func() {
441+
cm.waitForRunnable.Wait()
442+
close(allStopped)
443+
}()
444+
445+
select {
446+
case <-allStopped:
447+
return nil
448+
case <-runnableTearDownTimer.C:
449+
return fmt.Errorf("not all runnables have stopped within the proposed delay of %s", cm.runnableTearDownTimeout.String())
450+
}
451+
}
388452

389453
func (cm *controllerManager) startNonLeaderElectionRunnables() {
390454
cm.mu.Lock()
@@ -414,9 +478,7 @@ func (cm *controllerManager) startLeaderElectionRunnables() {
414478
// Controllers block, but we want to return an error if any have an error starting.
415479
// Write any Start errors to a channel so we can return them
416480
ctrl := c
417-
go func() {
418-
cm.errChan <- ctrl.Start(cm.internalStop)
419-
}()
481+
cm.startRunnable(ctrl)
420482
}
421483

422484
cm.startedLeader = true
@@ -478,3 +540,13 @@ func (cm *controllerManager) startLeaderElection() (err error) {
478540
go l.Run(ctx)
479541
return nil
480542
}
543+
544+
func (cm *controllerManager) startRunnable(r Runnable) {
545+
cm.waitForRunnable.Add(1)
546+
go func() {
547+
defer cm.waitForRunnable.Done()
548+
if err := r.Start(cm.internalStop); err != nil {
549+
cm.errChan <- err
550+
}
551+
}()
552+
}

pkg/manager/manager.go

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,10 @@ type Options struct {
185185
// Use this to customize the event correlator and spam filter
186186
EventBroadcaster record.EventBroadcaster
187187

188+
// RunnableTearDownTimeout is the duration given to runnable to stop
189+
// before the manager actually returns on stop.
190+
RunnableTearDownTimeout *time.Duration
191+
188192
// Dependency injection for testing
189193
newRecorderProvider func(config *rest.Config, scheme *runtime.Scheme, logger logr.Logger, broadcaster record.EventBroadcaster) (recorder.Provider, error)
190194
newResourceLock func(config *rest.Config, recorderProvider recorder.Provider, options leaderelection.Options) (resourcelock.Interface, error)
@@ -289,28 +293,29 @@ func New(config *rest.Config, options Options) (Manager, error) {
289293
stop := make(chan struct{})
290294

291295
return &controllerManager{
292-
config: config,
293-
scheme: options.Scheme,
294-
errChan: make(chan error),
295-
cache: cache,
296-
fieldIndexes: cache,
297-
client: writeObj,
298-
apiReader: apiReader,
299-
recorderProvider: recorderProvider,
300-
resourceLock: resourceLock,
301-
mapper: mapper,
302-
metricsListener: metricsListener,
303-
internalStop: stop,
304-
internalStopper: stop,
305-
port: options.Port,
306-
host: options.Host,
307-
certDir: options.CertDir,
308-
leaseDuration: *options.LeaseDuration,
309-
renewDeadline: *options.RenewDeadline,
310-
retryPeriod: *options.RetryPeriod,
311-
healthProbeListener: healthProbeListener,
312-
readinessEndpointName: options.ReadinessEndpointName,
313-
livenessEndpointName: options.LivenessEndpointName,
296+
config: config,
297+
scheme: options.Scheme,
298+
errChan: make(chan error),
299+
cache: cache,
300+
fieldIndexes: cache,
301+
client: writeObj,
302+
apiReader: apiReader,
303+
recorderProvider: recorderProvider,
304+
resourceLock: resourceLock,
305+
mapper: mapper,
306+
metricsListener: metricsListener,
307+
internalStop: stop,
308+
internalStopper: stop,
309+
port: options.Port,
310+
host: options.Host,
311+
certDir: options.CertDir,
312+
leaseDuration: *options.LeaseDuration,
313+
renewDeadline: *options.RenewDeadline,
314+
retryPeriod: *options.RetryPeriod,
315+
healthProbeListener: healthProbeListener,
316+
readinessEndpointName: options.ReadinessEndpointName,
317+
livenessEndpointName: options.LivenessEndpointName,
318+
runnableTearDownTimeout: *options.RunnableTearDownTimeout,
314319
}, nil
315320
}
316321

@@ -410,5 +415,10 @@ func setOptionsDefaults(options Options) Options {
410415
options.newHealthProbeListener = defaultHealthProbeListener
411416
}
412417

418+
if options.RunnableTearDownTimeout == nil {
419+
runnableTearDownTimeout := defaultRunnableTearDownTimeout
420+
options.RunnableTearDownTimeout = &runnableTearDownTimeout
421+
}
422+
413423
return options
414424
}

0 commit comments

Comments
 (0)