@@ -46,6 +46,7 @@ import (
4646 "github.com/tikv/pd/pkg/utils/keypath"
4747 "github.com/tikv/pd/pkg/utils/logutil"
4848 "github.com/tikv/pd/pkg/utils/syncutil"
49+ "github.com/tikv/pd/pkg/utils/tempurl"
4950 "github.com/tikv/pd/server"
5051 "github.com/tikv/pd/server/api"
5152 "github.com/tikv/pd/server/apiv2"
@@ -502,6 +503,9 @@ type TestCluster struct {
502503 }
503504 schedulingCluster * TestSchedulingCluster
504505 tsoCluster * TestTSOCluster
506+ // services and opts are stored for recreating servers on port conflicts
507+ services []string
508+ opts []ConfigOption
505509}
506510
// ConfigOption is used to define customized settings in test.
@@ -544,8 +548,10 @@ func createTestCluster(ctx context.Context, initialServerCount int, services []s
544548 servers [cfg .Name ] = s
545549 }
546550 return & TestCluster {
547- config : config ,
548- servers : servers ,
551+ config : config ,
552+ servers : servers ,
553+ services : services ,
554+ opts : opts ,
549555 tsPool : struct {
550556 syncutil.Mutex
551557 pool map [uint64 ]struct {}
@@ -643,11 +649,88 @@ func RunServers(servers []*TestServer) error {
643649
644650// RunInitialServers starts to run servers in InitialServers.
645651func (c * TestCluster ) RunInitialServers () error {
646- servers := make ([]* TestServer , 0 , len (c .config .InitialServers ))
647- for _ , conf := range c .config .InitialServers {
648- servers = append (servers , c .GetServer (conf .Name ))
652+ return c .RunInitialServersWithRetry (3 )
653+ }
654+
655+ // RunInitialServersWithRetry starts to run servers with port conflict handling.
656+ func (c * TestCluster ) RunInitialServersWithRetry (maxRetries int ) error {
657+ var lastErr error
658+ for i := range maxRetries {
659+ servers := make ([]* TestServer , 0 , len (c .config .InitialServers ))
660+ for _ , conf := range c .config .InitialServers {
661+ servers = append (servers , c .GetServer (conf .Name ))
662+ }
663+
664+ lastErr = RunServers (servers )
665+ if lastErr == nil {
666+ return nil
667+ }
668+
669+ // Check if it's a port conflict
670+ isPortConflict := strings .Contains (lastErr .Error (), "address already in use" )
671+
672+ if isPortConflict {
673+ log .Warn ("port conflict detected, recreating servers with new ports" ,
674+ zap .Int ("attempt" , i + 1 ),
675+ zap .Int ("maxRetries" , maxRetries ),
676+ zap .Error (lastErr ))
677+
678+ // Stop and destroy all servers
679+ for _ , s := range servers {
680+ if s .State () == Running {
681+ _ = s .Stop ()
682+ }
683+ _ = s .Destroy ()
684+ }
685+
686+ // Recreate servers with new ports
687+ for _ , conf := range c .config .InitialServers {
688+ // Regenerate config to get new ports
689+ conf .ClientURLs = tempurl .Alloc ()
690+ conf .PeerURLs = tempurl .Alloc ()
691+ conf .AdvertiseClientURLs = conf .ClientURLs
692+ conf .AdvertisePeerURLs = conf .PeerURLs
693+
694+ // Use the original opts passed during cluster creation
695+ allOpts := append ([]ConfigOption {WithGCTuner (false )}, c .opts ... )
696+ serverConf , err := conf .Generate (allOpts ... )
697+ if err != nil {
698+ return err
699+ }
700+
701+ // Use the original services passed during cluster creation
702+ s , err := NewTestServer (context .Background (), serverConf , c .services )
703+ if err != nil {
704+ return err
705+ }
706+ c .servers [conf .Name ] = s
707+ }
708+
709+ // Wait before retry
710+ backoff := time .Duration (i + 1 ) * 500 * time .Millisecond
711+ if backoff > 3 * time .Second {
712+ backoff = 3 * time .Second
713+ }
714+ time .Sleep (backoff )
715+ continue
716+ }
717+
718+ // For non-port-conflict errors, use regular retry
719+ if strings .Contains (lastErr .Error (), "ErrStartEtcd" ) {
720+ log .Warn ("etcd start failed, will retry" , zap .Error (lastErr ))
721+ for _ , s := range servers {
722+ if s .State () == Running {
723+ _ = s .Stop ()
724+ }
725+ }
726+ time .Sleep (100 * time .Millisecond )
727+ continue
728+ }
729+
730+ // For other errors, don't retry
731+ return lastErr
649732 }
650- return RunServersWithRetry ( servers , 3 )
733+ return errors . Wrapf ( lastErr , "failed to start servers after %d retries" , maxRetries )
651734}
652735
653736// RunServersWithRetry starts to run multiple TestServer with retry logic.
0 commit comments