Skip to content

Refactor graceful manager, fix misused WaitGroup #29738

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion modules/graceful/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,10 @@ func (g *Manager) setStateTransition(old, new state) bool {
// At the moment the total number of servers (numberOfServersToCreate) are pre-defined as a const before global init,
// so this function MUST be called if a server is not used.
func (g *Manager) InformCleanup() {
g.createServerWaitGroup.Done()
g.createServerCond.L.Lock()
defer g.createServerCond.L.Unlock()
g.createdServer++
g.createServerCond.Signal()
}

// Done allows the manager to be viewed as a context.Context, it returns a channel that is closed when the server is finished terminating
Expand Down
5 changes: 3 additions & 2 deletions modules/graceful/manager_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ type Manager struct {
terminateCtxCancel context.CancelFunc
managerCtxCancel context.CancelFunc
runningServerWaitGroup sync.WaitGroup
createServerWaitGroup sync.WaitGroup
terminateWaitGroup sync.WaitGroup
createServerCond sync.Cond
createdServer int
shutdownRequested chan struct{}

toRunAtShutdown []func()
Expand All @@ -52,7 +53,7 @@ type Manager struct {

func newGracefulManager(ctx context.Context) *Manager {
manager := &Manager{ctx: ctx, shutdownRequested: make(chan struct{})}
manager.createServerWaitGroup.Add(numberOfServersToCreate)
manager.createServerCond.L = &sync.Mutex{}
manager.prepare(ctx)
manager.start()
return manager
Expand Down
44 changes: 21 additions & 23 deletions modules/graceful/manager_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,37 +57,35 @@ func (g *Manager) start() {
// Handle clean up of unused provided listeners and delayed start-up
startupDone := make(chan struct{})
go func() {
defer close(startupDone)
// Wait till we're done getting all the listeners and then close the unused ones
func() {
// FIXME: there is a fundamental design problem of the "manager" and the "wait group".
// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned
// There is no clear solution besides a complete rewriting of the "manager"
defer func() {
_ = recover()
}()
g.createServerWaitGroup.Wait()
defer func() {
close(startupDone)
// Close the unused listeners and ignore the error here there's not much we can do with it, they're logged in the CloseProvidedListeners function
_ = CloseProvidedListeners()
}()
// Ignore the error here there's not much we can do with it, they're logged in the CloseProvidedListeners function
_ = CloseProvidedListeners()
g.notify(readyMsg)
// Wait for all servers to be created
g.createServerCond.L.Lock()
for {
if g.createdServer >= numberOfServersToCreate {
g.createServerCond.L.Unlock()
g.notify(readyMsg)
return
}
select {
case <-g.IsShutdown():
g.createServerCond.L.Unlock()
return
default:
}
g.createServerCond.Wait()
}
}()
if setting.StartupTimeout > 0 {
go func() {
select {
case <-startupDone:
return
case <-g.IsShutdown():
func() {
// When WaitGroup counter goes negative it will panic - we don't care about this so we can just ignore it.
defer func() {
_ = recover()
}()
// Ensure that the createServerWaitGroup stops waiting
for {
g.createServerWaitGroup.Done()
}
}()
g.createServerCond.Signal()
return
case <-time.After(setting.StartupTimeout):
log.Error("Startup took too long! Shutting down")
Expand Down
52 changes: 27 additions & 25 deletions modules/graceful/manager_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,33 +149,35 @@ hammerLoop:
func (g *Manager) awaitServer(limit time.Duration) bool {
c := make(chan struct{})
go func() {
defer close(c)
func() {
// FIXME: there is a fundamental design problem of the "manager" and the "wait group".
// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned
// There is no clear solution besides a complete rewriting of the "manager"
defer func() {
_ = recover()
}()
g.createServerWaitGroup.Wait()
}()
g.createServerCond.L.Lock()
for {
if g.createdServer >= numberOfServersToCreate {
g.createServerCond.L.Unlock()
close(c)
return
}
select {
case <-g.IsShutdown():
g.createServerCond.L.Unlock()
return
default:
}
g.createServerCond.Wait()
}
}()

var tc <-chan time.Time
if limit > 0 {
select {
case <-c:
return true // completed normally
case <-time.After(limit):
return false // timed out
case <-g.IsShutdown():
return false
}
} else {
select {
case <-c:
return true // completed normally
case <-g.IsShutdown():
return false
}
tc = time.After(limit)
}
select {
case <-c:
return true // completed normally
case <-tc:
return false // timed out
case <-g.IsShutdown():
g.createServerCond.Signal()
return false
}
}

Expand Down