@@ -114,7 +114,11 @@ type controllerManager struct {
114
114
started bool
115
115
startedLeader bool
116
116
healthzStarted bool
117
- errChan chan error
117
+
118
+ // NB(directxman12): we don't just use an error channel here to avoid the situation where the
119
+ // error channel is too small and we end up blocking some goroutines waiting to report their errors.
120
+ // errSignal lets us track when we should stop because an error occurred
121
+ errSignal * errSignaler
118
122
119
123
// internalStop is the stop channel *actually* used by everything involved
120
124
// with the manager as a stop channel, so that we can pass a stop channel
@@ -150,6 +154,51 @@ type controllerManager struct {
150
154
retryPeriod time.Duration
151
155
}
152
156
157
// errSignaler remembers a single reported error and announces that one has
// occurred by closing a channel, rather than by sending on an error channel
// that could fill up and block the goroutine doing the reporting.
type errSignaler struct {
	// errSignal is closed once an error has been recorded. Nothing should
	// ever be sent on it.
	errSignal chan struct{}

	// err holds the recorded error; it stays nil until one is signalled.
	err error

	// mu is held by the methods of this type while they touch the fields above.
	mu sync.Mutex
}
167
+
168
+ func (r * errSignaler ) SignalError (err error ) {
169
+ r .mu .Lock ()
170
+ defer r .mu .Unlock ()
171
+
172
+ if err == nil {
173
+ // non-error, ignore
174
+ log .Error (nil , "SignalError called without an (with a nil) error, which should never happen, ignoring" )
175
+ return
176
+ }
177
+
178
+ if r .err != nil {
179
+ // we already have an error, don't try again
180
+ return
181
+ }
182
+
183
+ // save the error and report it
184
+ r .err = err
185
+ close (r .errSignal )
186
+ }
187
+
188
+ func (r * errSignaler ) Error () error {
189
+ r .mu .Lock ()
190
+ defer r .mu .Unlock ()
191
+
192
+ return r .err
193
+ }
194
+
195
+ func (r * errSignaler ) GotError () chan struct {} {
196
+ r .mu .Lock ()
197
+ defer r .mu .Unlock ()
198
+
199
+ return r .errSignal
200
+ }
201
+
153
202
// Add sets dependencies on r, and adds it to the list of Runnables to start.
154
203
func (cm * controllerManager ) Add (r Runnable ) error {
155
204
cm .mu .Lock ()
@@ -174,7 +223,9 @@ func (cm *controllerManager) Add(r Runnable) error {
174
223
if shouldStart {
175
224
// If already started, start the controller
176
225
go func () {
177
- cm .errChan <- r .Start (cm .internalStop )
226
+ if err := r .Start (cm .internalStop ); err != nil {
227
+ cm .errSignal .SignalError (err )
228
+ }
178
229
}()
179
230
}
180
231
@@ -304,15 +355,15 @@ func (cm *controllerManager) serveMetrics(stop <-chan struct{}) {
304
355
go func () {
305
356
log .Info ("starting metrics server" , "path" , metricsPath )
306
357
if err := server .Serve (cm .metricsListener ); err != nil && err != http .ErrServerClosed {
307
- cm .errChan <- err
358
+ cm .errSignal . SignalError ( err )
308
359
}
309
360
}()
310
361
311
362
// Shutdown the server when stop is closed
312
363
select {
313
364
case <- stop :
314
365
if err := server .Shutdown (context .Background ()); err != nil {
315
- cm .errChan <- err
366
+ cm .errSignal . SignalError ( err )
316
367
}
317
368
}
318
369
}
@@ -334,7 +385,7 @@ func (cm *controllerManager) serveHealthProbes(stop <-chan struct{}) {
334
385
// Run server
335
386
go func () {
336
387
if err := server .Serve (cm .healthProbeListener ); err != nil && err != http .ErrServerClosed {
337
- cm .errChan <- err
388
+ cm .errSignal . SignalError ( err )
338
389
}
339
390
}()
340
391
cm .healthzStarted = true
@@ -344,7 +395,7 @@ func (cm *controllerManager) serveHealthProbes(stop <-chan struct{}) {
344
395
select {
345
396
case <- stop :
346
397
if err := server .Shutdown (context .Background ()); err != nil {
347
- cm .errChan <- err
398
+ cm .errSignal . SignalError ( err )
348
399
}
349
400
}
350
401
}
@@ -353,6 +404,9 @@ func (cm *controllerManager) Start(stop <-chan struct{}) error {
353
404
// join the passed-in stop channel as an upstream feeding into cm.internalStopper
354
405
defer close (cm .internalStopper )
355
406
407
+ // initialize this here so that we reset the signal channel state on every start
408
+ cm .errSignal = & errSignaler {errSignal : make (chan struct {})}
409
+
356
410
// Metrics should be served whether the controller is leader or not.
357
411
// (If we don't serve metrics for non-leaders, prometheus will still scrape
358
412
// the pod but will get a connection refused)
@@ -380,9 +434,9 @@ func (cm *controllerManager) Start(stop <-chan struct{}) error {
380
434
case <- stop :
381
435
// We are done
382
436
return nil
383
- case err := <- cm .errChan :
437
+ case <- cm .errSignal . GotError () :
384
438
// Error starting a controller
385
- return err
439
+ return cm . errSignal . Error ()
386
440
}
387
441
}
388
442
@@ -398,7 +452,9 @@ func (cm *controllerManager) startNonLeaderElectionRunnables() {
398
452
// Write any Start errors to a channel so we can return them
399
453
ctrl := c
400
454
go func () {
401
- cm .errChan <- ctrl .Start (cm .internalStop )
455
+ if err := ctrl .Start (cm .internalStop ); err != nil {
456
+ cm .errSignal .SignalError (err )
457
+ }
402
458
}()
403
459
}
404
460
}
@@ -415,7 +471,9 @@ func (cm *controllerManager) startLeaderElectionRunnables() {
415
471
// Write any Start errors to a channel so we can return them
416
472
ctrl := c
417
473
go func () {
418
- cm .errChan <- ctrl .Start (cm .internalStop )
474
+ if err := ctrl .Start (cm .internalStop ); err != nil {
475
+ cm .errSignal .SignalError (err )
476
+ }
419
477
}()
420
478
}
421
479
@@ -433,7 +491,7 @@ func (cm *controllerManager) waitForCache() {
433
491
}
434
492
go func () {
435
493
if err := cm .startCache (cm .internalStop ); err != nil {
436
- cm .errChan <- err
494
+ cm .errSignal . SignalError ( err )
437
495
}
438
496
}()
439
497
@@ -457,7 +515,7 @@ func (cm *controllerManager) startLeaderElection() (err error) {
457
515
// Most implementations of leader election log.Fatal() here.
458
516
// Since Start is wrapped in log.Fatal when called, we can just return
459
517
// an error here which will cause the program to exit.
460
- cm .errChan <- fmt .Errorf ("leader election lost" )
518
+ cm .errSignal . SignalError ( fmt .Errorf ("leader election lost" ) )
461
519
},
462
520
},
463
521
})
0 commit comments