Skip to content

Commit c2d2abc

Browse files
committed
retry when address already in use
Signed-off-by: Ryan Leung <rleungx@gmail.com>
1 parent e04c3dc commit c2d2abc

File tree

1 file changed

+89
-6
lines changed

1 file changed

+89
-6
lines changed

tests/cluster.go

Lines changed: 89 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ import (
4646
"github.com/tikv/pd/pkg/utils/keypath"
4747
"github.com/tikv/pd/pkg/utils/logutil"
4848
"github.com/tikv/pd/pkg/utils/syncutil"
49+
"github.com/tikv/pd/pkg/utils/tempurl"
4950
"github.com/tikv/pd/server"
5051
"github.com/tikv/pd/server/api"
5152
"github.com/tikv/pd/server/apiv2"
@@ -502,6 +503,9 @@ type TestCluster struct {
502503
}
503504
schedulingCluster *TestSchedulingCluster
504505
tsoCluster *TestTSOCluster
506+
// services and opts are stored for recreating servers on port conflicts
507+
services []string
508+
opts []ConfigOption
505509
}
506510

507511
// ConfigOption is used to define customized settings in tests.
@@ -544,8 +548,10 @@ func createTestCluster(ctx context.Context, initialServerCount int, services []s
544548
servers[cfg.Name] = s
545549
}
546550
return &TestCluster{
547-
config: config,
548-
servers: servers,
551+
config: config,
552+
servers: servers,
553+
services: services,
554+
opts: opts,
549555
tsPool: struct {
550556
syncutil.Mutex
551557
pool map[uint64]struct{}
@@ -643,11 +649,88 @@ func RunServers(servers []*TestServer) error {
643649

644650
// RunInitialServers starts to run servers in InitialServers.
645651
func (c *TestCluster) RunInitialServers() error {
646-
servers := make([]*TestServer, 0, len(c.config.InitialServers))
647-
for _, conf := range c.config.InitialServers {
648-
servers = append(servers, c.GetServer(conf.Name))
652+
return c.RunInitialServersWithRetry(3)
653+
}
654+
655+
// RunInitialServersWithRetry starts to run servers with port conflict handling.
656+
func (c *TestCluster) RunInitialServersWithRetry(maxRetries int) error {
657+
var lastErr error
658+
for i := range maxRetries {
659+
servers := make([]*TestServer, 0, len(c.config.InitialServers))
660+
for _, conf := range c.config.InitialServers {
661+
servers = append(servers, c.GetServer(conf.Name))
662+
}
663+
664+
lastErr = RunServers(servers)
665+
if lastErr == nil {
666+
return nil
667+
}
668+
669+
// Check if it's a port conflict
670+
isPortConflict := strings.Contains(lastErr.Error(), "address already in use")
671+
672+
if isPortConflict {
673+
log.Warn("port conflict detected, recreating servers with new ports",
674+
zap.Int("attempt", i+1),
675+
zap.Int("maxRetries", maxRetries),
676+
zap.Error(lastErr))
677+
678+
// Stop and destroy all servers
679+
for _, s := range servers {
680+
if s.State() == Running {
681+
_ = s.Stop()
682+
}
683+
_ = s.Destroy()
684+
}
685+
686+
// Recreate servers with new ports
687+
for _, conf := range c.config.InitialServers {
688+
// Regenerate config to get new ports
689+
conf.ClientURLs = tempurl.Alloc()
690+
conf.PeerURLs = tempurl.Alloc()
691+
conf.AdvertiseClientURLs = conf.ClientURLs
692+
conf.AdvertisePeerURLs = conf.PeerURLs
693+
694+
// Use the original opts passed during cluster creation
695+
allOpts := append([]ConfigOption{WithGCTuner(false)}, c.opts...)
696+
serverConf, err := conf.Generate(allOpts...)
697+
if err != nil {
698+
return err
699+
}
700+
701+
// Use the original services passed during cluster creation
702+
s, err := NewTestServer(context.Background(), serverConf, c.services)
703+
if err != nil {
704+
return err
705+
}
706+
c.servers[conf.Name] = s
707+
}
708+
709+
// Wait before retry
710+
backoff := time.Duration(i+1) * 500 * time.Millisecond
711+
if backoff > 3*time.Second {
712+
backoff = 3 * time.Second
713+
}
714+
time.Sleep(backoff)
715+
continue
716+
}
717+
718+
// For non-port-conflict errors, use regular retry
719+
if strings.Contains(lastErr.Error(), "ErrStartEtcd") {
720+
log.Warn("etcd start failed, will retry", zap.Error(lastErr))
721+
for _, s := range servers {
722+
if s.State() == Running {
723+
_ = s.Stop()
724+
}
725+
}
726+
time.Sleep(100 * time.Millisecond)
727+
continue
728+
}
729+
730+
// For other errors, don't retry
731+
return lastErr
649732
}
650-
return RunServersWithRetry(servers, 3)
733+
return errors.Wrapf(lastErr, "failed to start servers after %d retries", maxRetries)
651734
}
652735

653736
// RunServersWithRetry starts to run multiple TestServer with retry logic.

0 commit comments

Comments
 (0)