Merged
@@ -746,10 +746,49 @@ private[akka] class ShardRegion(

var coordinator: Option[ActorRef] = None

def reRegisterIfCoordinatorNotUp(): Unit =
  if (coordinator.nonEmpty) {
    val coordAddress = coordinator.get.path.address // safe: guarded by nonEmpty
    val coordinatorStatus =
      coordinator.flatMap { _ =>
        membersByAge.find(_.address == coordAddress).map(_.status)
      }

    coordinatorStatus match {
      case Some(MemberStatus.Up) => () // Do nothing
      case Some(notUp) =>
        if (log.isDebugEnabled)
          log.debug(
            "{}: Coordinator is on node with status [{}], proactively attempting to find next coordinator",
            typeName,
            notUp)

        // For now, make one attempt to register with the oldest Up member we know about, without forgetting
        // about the current coordinator.
        // We only make one attempt to one candidate so as to not flood with registration messages.
        // Important since this is level-triggered (any membership change where the coordinator is on a
        // not-up node) while registration is otherwise edge-triggered.
        coordinatorSelection.headOption.foreach(sendRegistrationMessage)

        if (!timers.isTimerActive(RegisterRetry)) {
Contributor:
will this work as intended, since we don't set coordinator = None for this case?

    case RegisterRetry =>
      if (coordinator.isEmpty) {

Contributor:
we watch the current coordinator, and when it terminates we will startRegistration(), so that is already covered

Contributor Author (@leviramsey, May 22, 2025):
RegisterRetry is also changed to reRegisterIfCoordinatorNotUp if there's a coordinator present.

Yes, eventually the current coordinator will stop (worst case, we see it stop thanks to the failure detector before removal gossip) and our watch will trigger full registration. That can be several seconds in the future.

Contributor:
I see, good.

Contributor Author:
Scheduling the retry also helps in the event that the next-youngest is also being stopped by (e.g.) Kubernetes (consider a deployment where maxSurge/maxUnavailable allows multiple pods to be stopped per round and deletion cost is being set based on cluster age: then it's likely that the oldest n pods get stopped) but we haven't found out about it yet.

An alternative could be to compute the youngest Up on membership changes and go through this if that changes, but I think a fairly quick (default 250ms) retry is sufficient?

Contributor:
I think we are good here. Should also be fine to configure a shorter retry-interval. Default is 2 seconds. The initial interval is derived from that, but at least 100 ms.
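As discussed above, a deployment that wants the proactive probe retried sooner than the 2-second default can shorten the sharding retry interval; a minimal sketch of the relevant setting (the path matches Akka's reference configuration, the 250 ms value is just the figure from this thread, not a recommendation):

```hocon
akka.cluster.sharding {
  # default is 2 s; the initial registration delay is derived from
  # this value, but is at least 100 ms
  retry-interval = 250 ms
}
```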

          nextRegistrationDelay = initRegistrationDelay
          scheduleNextRegistration()
        }

      case None =>
        // coordinator is on node which has been removed... can this actually happen?
        log.warning("{}: Coordinator was on removed node [{}], attempting to re-register", typeName, coordAddress)
        coordinator = None
        startRegistration()
    }
  }
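The match above boils down to a small, level-triggered decision on the coordinator node's membership status. A self-contained sketch (the simplified `Status` values and the `Action` names are illustrative assumptions, not the real Akka types):

```scala
// Simplified stand-ins for akka.cluster.MemberStatus (assumption for illustration)
sealed trait Status
case object Up extends Status
case object Leaving extends Status

// Hypothetical action names describing what reRegisterIfCoordinatorNotUp does
sealed trait Action
case object NoOp extends Action           // coordinator's node is Up: nothing to do
case object ProbeOldest extends Action    // single registration attempt, coordinator kept
case object FullReRegister extends Action // coordinator forgotten, registration restarted

def decide(coordinatorStatus: Option[Status]): Action = coordinatorStatus match {
  case Some(Up) => NoOp
  case Some(_)  => ProbeOldest    // not-up but still a known member: one attempt, no flooding
  case None     => FullReRegister // coordinator's node no longer in membersByAge
}
```

Because the check is level-triggered, it is safe to re-run on every membership change: repeated calls with the same status produce the same single, bounded action.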

def changeMembers(newMembers: immutable.SortedSet[Member]): Unit = {
  val before = membersByAge.headOption
  val after = newMembers.headOption
  membersByAge = newMembers
  // NB: equality check is on uniqueAddress, not status etc.
  if (before != after) {
    if (log.isDebugEnabled)
      log.debug(
@@ -759,7 +798,7 @@
        after.map(_.address).getOrElse(""))
    coordinator = None
    startRegistration()
  } else reRegisterIfCoordinatorNotUp()
}
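The `// NB` comment is the crux of the `else` branch: member equality is based on the node's unique address, so a status-only change on the oldest node leaves `before != after` false and we fall through to the proactive check instead of a full re-registration. A sketch with a hypothetical simplified `Member` (not Akka's real class) showing that behavior:

```scala
// Hypothetical simplified Member: equality on uniqueAddress only,
// mirroring how Akka's Member compares (assumption for illustration)
final case class UniqueAddress(host: String, uid: Long)

final class Member(val uniqueAddress: UniqueAddress, val status: String) {
  override def equals(other: Any): Boolean = other match {
    case m: Member => m.uniqueAddress == uniqueAddress
    case _         => false
  }
  override def hashCode: Int = uniqueAddress.hashCode
}

val oldestUp      = new Member(UniqueAddress("10.0.0.1", 1L), "Up")
val oldestLeaving = new Member(UniqueAddress("10.0.0.1", 1L), "Leaving") // same node, new status
val newOldest     = new Member(UniqueAddress("10.0.0.2", 2L), "Up")

// Status-only change: headOption compares equal, so changeMembers takes the
// else branch (reRegisterIfCoordinatorNotUp) rather than startRegistration()
val statusOnlyChange = Option(oldestUp) == Option(oldestLeaving)
// Different node at the head: full re-registration path
val headChanged = Option(oldestUp) != Option(newOldest)
```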
Contributor:
Wonder if we could speed up the registration when coordinator is None, and there is a membership change?
Right now that will be from the scheduled retries, which could be up to 2 seconds (default config)

Contributor:
Maybe good enough with the retry interval. Should also be fine to configure a shorter retry-interval. Default is 2 seconds. The initial interval is derived from that, but at least 100 ms.


def receive: Receive = {
@@ -789,9 +828,13 @@ private[akka] class ShardRegion(
def receiveClusterEvent(evt: ClusterDomainEvent): Unit = evt match {
  case MemberUp(m) =>
    addMember(m)

  case MemberLeft(m) =>
    // updates the status in the set
    addMember(m)

  case MemberExited(m) =>
    // updates the status in the set
    addMember(m)

  case MemberRemoved(m, _) =>
@@ -930,7 +973,7 @@ private[akka] class ShardRegion(
if (coordinator.isEmpty) {
  register()
  scheduleNextRegistration()
} else reRegisterIfCoordinatorNotUp()

case GracefulShutdown =>
  if (preparingForShutdown) {
@@ -1128,7 +1171,7 @@ private[akka] class ShardRegion(

def register(): Unit = {
  val actorSelections = coordinatorSelection
  actorSelections.foreach(sendRegistrationMessage)
  if (shardBuffers.nonEmpty && retryCount >= 5) {
    if (actorSelections.nonEmpty) {
      val coordinatorMessage =
@@ -1175,6 +1218,10 @@
    }
  }

def sendRegistrationMessage(selection: ActorSelection): Unit = {
  selection ! registrationMessage
}

def registrationMessage: Any =
  if (entityProps.isDefined) Register(self) else RegisterProxy(self)
