Commit 8dd1207

Removes unsafe "recover to empty" code.
This isn't safe because it would implicitly commit all outstanding log entries. The new Raft library already has logic to not start a vote if the current node isn't in the configuration, so this shouldn't be needed.
1 parent eb07996 commit 8dd1207
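
For context on the commit message's claim: hashicorp/raft declines to start an election when the local node is absent from the latest configuration, which is what makes the deleted recover-to-empty step unnecessary. A minimal sketch of that membership check, using the library's Configuration/ServerID types; shouldCampaign is a hypothetical helper for illustration, not a library API:

    package main

    import "github.com/hashicorp/raft"

    // shouldCampaign is a hypothetical helper; hashicorp/raft performs an
    // equivalent membership check internally before starting an election.
    func shouldCampaign(latest raft.Configuration, localID raft.ServerID) bool {
    	for _, server := range latest.Servers {
    		if server.ID == localID && server.Suffrage == raft.Voter {
    			return true // still a voter in the latest configuration: may campaign
    		}
    	}
    	// Removed from the configuration (e.g. after a graceful leave):
    	// stay quiet rather than spamming votes or rewriting local state.
    	return false
    }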

File tree

1 file changed: +12 -39 lines changed

consul/server.go

Lines changed: 12 additions & 39 deletions
@@ -94,9 +94,6 @@ type Server struct {
 	// strong consistency.
 	fsm *consulFSM
 
-	// left is true if we have attempted to leave the cluster.
-	left bool
-
 	// localConsuls is used to track the known consuls
 	// in the local datacenter. Used to do leader forwarding.
 	localConsuls map[raft.ServerAddress]*agent.Server
@@ -106,17 +103,13 @@ type Server struct {
 	logger *log.Logger
 
 	// The raft instance is used among Consul nodes within the DC to protect
-	// operations that require strong consistency. The raftSafeFn will get
-	// called on a graceful leave to help "safe" the state of the server so
-	// it won't interfere with other servers. This will be called after Raft
-	// is shutdown, but before the state store is closed, so it can manipulate
+	// operations that require strong consistency.
 	// the state directly.
 	raft          *raft.Raft
 	raftLayer     *RaftLayer
 	raftStore     *raftboltdb.BoltStore
 	raftTransport *raft.NetworkTransport
 	raftInmem     *raft.InmemStore
-	raftSafeFn    func() error
 
 	// reconcileCh is used to pass events from the serf handler
 	// into the leader manager, so that the strong state can be
@@ -466,31 +459,6 @@ Please see https://www.consul.io/docs/guides/outage.html for more information.
 			}
 			s.logger.Printf("[INFO] consul: deleted peers.json file after successful recovery")
 		}
-
-		// Register a cleanup function to call when a leave occurs. This
-		// needs state that we only have access to here. This does a
-		// recover operation to an empty configuration so this server
-		// won't interfere with the rest of the cluster.
-		s.raftSafeFn = func() error {
-			hasState, err := raft.HasExistingState(log, stable, snap)
-			if err != nil {
-				return fmt.Errorf("cleanup failed to check for state: %v", err)
-			}
-			if !hasState {
-				return nil
-			}
-
-			tmpFsm, err := NewFSM(s.tombstoneGC, s.config.LogOutput)
-			if err != nil {
-				return fmt.Errorf("cleanup failed to make temp FSM: %v", err)
-			}
-			if err := raft.RecoverCluster(s.config.RaftConfig, tmpFsm,
-				log, stable, snap, trans, raft.Configuration{}); err != nil {
-				return fmt.Errorf("recovery failed: %v", err)
-			}
-
-			return nil
-		}
 	}
 
 	// If we are in bootstrap or dev mode and the state is clean then we can
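
The peers.json path retained just above this hunk is the sanctioned use of raft.RecoverCluster: it recovers to an explicit, operator-supplied configuration rather than the empty one the deleted closure passed, so the implicit commit of outstanding log entries becomes a deliberate operator action. A minimal sketch of that retained path, assuming the surrounding setupRaft variables (peersFile, log, stable, snap, trans) match the context lines:

    // Sketch only; variable names are assumed from the surrounding setupRaft code.
    configuration, err := raft.ReadPeersJSON(peersFile)
    if err != nil {
    	return fmt.Errorf("recovery failed to parse peers.json: %v", err)
    }
    tmpFsm, err := NewFSM(s.tombstoneGC, s.config.LogOutput)
    if err != nil {
    	return fmt.Errorf("recovery failed to make temp FSM: %v", err)
    }
    if err := raft.RecoverCluster(s.config.RaftConfig, tmpFsm,
    	log, stable, snap, trans, configuration); err != nil {
    	return fmt.Errorf("recovery failed: %v", err)
    }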
@@ -614,11 +582,6 @@ func (s *Server) Shutdown() error {
 		if err := future.Error(); err != nil {
 			s.logger.Printf("[WARN] consul: error shutting down raft: %s", err)
 		}
-		if s.left && s.raftSafeFn != nil {
-			if err := s.raftSafeFn(); err != nil {
-				s.logger.Printf("[WARN] consul: error safing raft: %s", err)
-			}
-		}
 		if s.raftStore != nil {
 			s.raftStore.Close()
 		}
@@ -637,7 +600,6 @@ func (s *Server) Shutdown() error {
 // Leave is used to prepare for a graceful shutdown of the server
 func (s *Server) Leave() error {
 	s.logger.Printf("[INFO] consul: server starting leave")
-	s.left = true
 
 	// Check the number of known peers
 	numPeers, err := s.numPeers()
@@ -703,6 +665,17 @@ func (s *Server) Leave() error {
 		}
 	}
 
+	// TODO (slackpad) With the old Raft library we used to force the
+	// peers set to empty when a graceful leave occurred. This would
+	// keep voting spam down if the server was restarted, but it was
+	// dangerous because the peers was inconsistent with the logs and
+	// snapshots, so it wasn't really safe in all cases for the server
+	// to become leader. This is now safe, but the log spam is noisy.
+	// The next new version of the library will have a "you are not a
+	// peer stop it" behavior that should address this. We will have
+	// to evaluate during the RC period if this interim situation is
+	// not too confusing for operators.
+
 	// TODO (slackpad) When we take a later new version of the Raft
 	// library it won't try to complete replication, so this peer
 	// may not realize that it has been removed. Need to revisit this
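
For the interim behavior the new TODO describes, the leave path relies on observing the committed configuration rather than rewriting the peers set. A rough sketch of that shape, assuming a grace-period constant and the server's advertised address (raftRemoveGracePeriod and addr are assumptions here); this mirrors the structure of Consul's Leave rather than quoting it:

    // Poll the committed configuration until this server is gone or a
    // grace period expires; names are illustrative.
    left := false
    deadline := time.Now().Add(raftRemoveGracePeriod)
    for !left && time.Now().Before(deadline) {
    	time.Sleep(50 * time.Millisecond)

    	future := s.raft.GetConfiguration()
    	if err := future.Error(); err != nil {
    		return fmt.Errorf("failed to get raft configuration: %v", err)
    	}
    	left = true
    	for _, server := range future.Configuration().Servers {
    		if server.Address == addr {
    			left = false // still in the configuration; keep waiting
    			break
    		}
    	}
    }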
