feat: add periodic stale-region-detection for ShardCoordinator (#32892)

janikdotzel · web-flow · commit fa8f803687e1 · 2026-02-20T15:47:13.000+01:00
* feat: add stale-region-detection safety net for ShardCoordinator * switch to a re-watch mechanism * docs: add paragraph for Stale region detection * Revert "docs: add paragraph for Stale region detection" This reverts commit b5b4564. * fix: use duration converters and use timer with fixed delay * add warning log for found stale regions
diff --git a/akka-cluster-sharding/src/main/resources/reference.conf b/akka-cluster-sharding/src/main/resources/reference.conf
@@ -397,7 +397,27 @@ akka.cluster.sharding {
     # The tradeoff of increasing this is that coordinator startup will be slower.
     read-majority-plus = 5
   }
-  
+
+  # Periodic detection of stale regions by re-watching remote regions.
+  # If a Terminated message from DeathWatch was lost or dropped, the coordinator
+  # still believes shards live on a node that has left the cluster.
+  # When enabled, the coordinator periodically unwatches and re-watches all
+  # remote regions. If the node is gone, ClusterRemoteWatcher will immediately
+  # deliver a new Terminated via its memberTombstones check. If the node is
+  # alive, the watch is simply re-established (no-op).
+  stale-region-detection {
+    # Enable periodic stale region detection. Off by default.
+    enabled = off
+
+    # How often to re-watch remote regions.
+    check-interval = 30s
+
+    # Grace period after coordinator startup before starting checks.
+    # During startup, watchStateActors() already handles stale regions,
+    # and the cluster may still be converging.
+    startup-grace-period = 300s
+  }
+
   # Settings for the Distributed Data replicator. 
   # Same layout as akka.cluster.distributed-data.
   # The "role" of the distributed-data configuration is not used. The distributed-data
diff --git a/akka-cluster-sharding/src/main/scala/akka/cluster/sharding/ShardCoordinator.scala b/akka-cluster-sharding/src/main/scala/akka/cluster/sharding/ShardCoordinator.scala
@@ -12,6 +12,7 @@ import scala.collection.immutable
 import scala.concurrent.ExecutionContext
 import scala.concurrent.Future
 import scala.concurrent.duration._
+import scala.jdk.DurationConverters._
 import scala.util.Success
 
 import akka.actor._
@@ -604,6 +605,8 @@ object ShardCoordinator {
 
   private final case class DelayedShardRegionTerminated(region: ActorRef)
 
+  private case object StaleRegionCheckTick
+
   private final case class StopShardTimeout(requestId: UUID)
 
   /**
@@ -823,6 +826,14 @@ abstract class ShardCoordinator(
   // each waiting actor together with a request identifier to clear out all waiting for one request on timeout
   var waitingForShardsToStop: Map[ShardId, Set[(ActorRef, UUID)]] = Map.empty
 
+  private val staleRegionDetectionConfig = {
+    val c = context.system.settings.config.getConfig("akka.cluster.sharding.stale-region-detection")
+    val enabled = c.getBoolean("enabled")
+    val checkInterval = c.getDuration("check-interval").toScala
+    val startupGracePeriod = c.getDuration("startup-grace-period").toScala
+    (enabled, checkInterval, startupGracePeriod)
+  }
+
   import context.dispatcher
 
   cluster.subscribe(
@@ -1141,6 +1152,19 @@ abstract class ShardCoordinator(
           }
         }
 
+      case StaleRegionCheckTick =>
+        // Re-watch all remote regions.
+        // If the node is gone, ClusterRemoteWatcher.addWatch() will immediately
+        // trigger a new Terminated via the memberTombstones check.
+        // If the node is alive, the watch is simply re-established (no-op).
+        state.regions.keys.foreach { ref =>
+          if (!ref.path.address.hasLocalScope) {
+            log.warning("{}: Stale region detection, re-watching region [{}]", typeName, ref)
+            context.unwatch(ref)
+            context.watch(ref)
+          }
+        }
+
       case ShardCoordinator.Internal.Terminate =>
         terminate()
     }: Receive).orElse[Any, Unit](receiveTerminated)
@@ -1319,6 +1343,12 @@ abstract class ShardCoordinator(
     // This is an optimization that makes it operational faster and reduces the
     // amount of lost messages during startup.
     context.system.scheduler.scheduleOnce(500.millis, self, StateInitialized)
+
+    // Start stale region detection after the startup grace period
+    val (srdEnabled, srdCheckInterval, srdStartupGrace) = staleRegionDetectionConfig
+    if (srdEnabled) {
+      timers.startTimerWithFixedDelay(StaleRegionCheckTick, StaleRegionCheckTick, srdStartupGrace, srdCheckInterval)
+    }
   }
 
   def stateInitialized(): Unit = {
diff --git a/akka-cluster-sharding/src/test/scala/akka/cluster/sharding/StaleRegionDetectionSpec.scala b/akka-cluster-sharding/src/test/scala/akka/cluster/sharding/StaleRegionDetectionSpec.scala
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2009-2025 Lightbend Inc. <https://www.lightbend.com>
+ */
+
+package akka.cluster.sharding
+
+import scala.concurrent.Future
+import scala.concurrent.duration._
+
+import com.typesafe.config.ConfigFactory
+
+import akka.actor._
+import akka.cluster.Cluster
+import akka.cluster.MemberStatus
+import akka.cluster.ddata.LWWRegister
+import akka.cluster.ddata.LWWRegisterKey
+import akka.cluster.ddata.Replicator._
+import akka.cluster.sharding.ShardCoordinator.Internal._
+import akka.cluster.sharding.ShardRegion.ShardId
+import akka.testkit._
+
+object StaleRegionDetectionSpec {
+
+  def config(enabled: Boolean = true) = ConfigFactory.parseString(s"""
+    akka.actor.provider = cluster
+    akka.remote.artery.canonical.port = 0
+    akka.loglevel = DEBUG
+    akka.loggers = ["akka.testkit.SilenceAllTestEventListener"]
+    akka.cluster.sharding.verbose-debug-logging = on
+    akka.cluster.sharding.updating-state-timeout = 60s
+    akka.cluster.sharding.rebalance-interval = 120s
+    akka.cluster.sharding.shard-start-timeout = 120s
+    akka.cluster.min-nr-of-members = 1
+    akka.cluster.sharding.stale-region-detection {
+      enabled = $enabled
+      check-interval = 500ms
+      startup-grace-period = 0s
+    }
+  """)
+
+  class TestAllocationStrategy extends ShardCoordinator.ShardAllocationStrategy {
+    @volatile var targetRegion: Option[ActorRef] = None
+
+    override def allocateShard(
+        requester: ActorRef,
+        shardId: ShardId,
+        currentShardAllocations: Map[ActorRef, IndexedSeq[ShardId]]): Future[ActorRef] =
+      Future.successful(targetRegion.getOrElse(currentShardAllocations.minBy(_._2.size)._1))
+
+    override def rebalance(
+        currentShardAllocations: Map[ActorRef, IndexedSeq[ShardId]],
+        rebalanceInProgress: Set[ShardId]): Future[Set[ShardId]] =
+      Future.successful(Set.empty)
+  }
+}
+
+class StaleRegionDetectionSpec extends AkkaSpec(StaleRegionDetectionSpec.config()) with WithLogCapturing {
+
+  import StaleRegionDetectionSpec._
+
+  private type CoordinatorUpdate = Update[LWWRegister[State]]
+
+  private case class Fixture(coordinator: ActorRef, replicatorProbe: TestProbe, strategy: TestAllocationStrategy)
+
+  private var testCounter = 0
+
+  private def nextTypeName(): String = {
+    testCounter += 1
+    s"TestEntity$testCounter"
+  }
+
+  override def atStartup(): Unit = {
+    val cluster = Cluster(system)
+    cluster.join(cluster.selfAddress)
+    awaitAssert {
+      cluster.readView.members.count(_.status == MemberStatus.Up) should ===(1)
+    }
+  }
+
+  private def createFixture(): Fixture = {
+    val typeName = nextTypeName()
+    val strategy = new TestAllocationStrategy
+    val replicatorProbe = TestProbe()
+    val coordinator = system.actorOf(
+      ShardCoordinator.props(
+        typeName,
+        ClusterShardingSettings(system),
+        strategy,
+        replicatorProbe.ref,
+        majorityMinCap = 0,
+        rememberEntitiesStoreProvider = None))
+
+    // Bootstrap: respond to the initial ddata Get with empty state
+    replicatorProbe.expectMsgType[Get[_]](5.seconds)
+    replicatorProbe.reply(NotFound(LWWRegisterKey[State](s"${typeName}CoordinatorState"), None))
+    replicatorProbe.expectNoMessage(100.millis)
+
+    Fixture(coordinator, replicatorProbe, strategy)
+  }
+
+  private def completeNextUpdate(replicatorProbe: TestProbe): DomainEvent = {
+    val update = replicatorProbe.expectMsgType[CoordinatorUpdate](5.seconds)
+    val evt = update.request.get.asInstanceOf[DomainEvent]
+    replicatorProbe.reply(UpdateSuccess(update.key, update.request))
+    evt
+  }
+
+  private def registerRegion(coordinator: ActorRef, replicatorProbe: TestProbe): TestProbe = {
+    val region = TestProbe()
+    coordinator.tell(Register(region.ref), region.ref)
+    completeNextUpdate(replicatorProbe)
+    region.expectMsgType[RegisterAck](5.seconds)
+    region
+  }
+
+  private def allocateShard(coordinator: ActorRef, shardId: String, replicatorProbe: TestProbe): ShardHome = {
+    val probe = TestProbe()
+    coordinator.tell(GetShardHome(shardId), probe.ref)
+    completeNextUpdate(replicatorProbe)
+    probe.expectMsgType[ShardHome](5.seconds)
+  }
+
+  "StaleRegionDetection" must {
+
+    "detect and clean up a stale region via unwatch/rewatch" in {
+      val f = createFixture()
+      val Fixture(coordinator, replicatorProbe, strategy) = f
+
+      try {
+        val regionA = registerRegion(coordinator, replicatorProbe)
+
+        strategy.targetRegion = Some(regionA.ref)
+        allocateShard(coordinator, "s1", replicatorProbe)
+
+        // Stop regionA without DeathWatch delivering Terminated to coordinator
+        // (simulating the bug where Terminated is lost)
+        watch(regionA.ref)
+        system.stop(regionA.ref)
+        expectTerminated(regionA.ref, 5.seconds)
+
+        // Wait for stale region detection to kick in
+        // startup-grace-period = 0s, check-interval = 500ms
+        // The unwatch/rewatch cycle should trigger a new Terminated quickly
+        val terminated = replicatorProbe.expectMsgType[CoordinatorUpdate](5.seconds)
+        val evt = terminated.request.get.asInstanceOf[DomainEvent]
+        evt shouldBe a[ShardRegionTerminated]
+        evt.asInstanceOf[ShardRegionTerminated].region should ===(regionA.ref)
+      } finally system.stop(coordinator)
+    }
+
+    "not act on regions whose member is still in the cluster" in {
+      val f = createFixture()
+      val Fixture(coordinator, replicatorProbe, strategy) = f
+
+      try {
+        val regionA = registerRegion(coordinator, replicatorProbe)
+
+        strategy.targetRegion = Some(regionA.ref)
+        allocateShard(coordinator, "s1", replicatorProbe)
+
+        // regionA is local (same node), so its address hasLocalScope — never re-watched
+        // Wait long enough for multiple check cycles
+        replicatorProbe.expectNoMessage(3.seconds)
+      } finally system.stop(coordinator)
+    }
+  }
+}
+
+class StaleRegionDetectionDisabledSpec
+    extends AkkaSpec(StaleRegionDetectionSpec.config(enabled = false))
+    with WithLogCapturing {
+
+  import StaleRegionDetectionSpec._
+
+  private type CoordinatorUpdate = Update[LWWRegister[State]]
+
+  override def atStartup(): Unit = {
+    val cluster = Cluster(system)
+    cluster.join(cluster.selfAddress)
+    awaitAssert {
+      cluster.readView.members.count(_.status == MemberStatus.Up) should ===(1)
+    }
+  }
+
+  private def completeNextUpdate(replicatorProbe: TestProbe): DomainEvent = {
+    val update = replicatorProbe.expectMsgType[CoordinatorUpdate](5.seconds)
+    val evt = update.request.get.asInstanceOf[DomainEvent]
+    replicatorProbe.reply(UpdateSuccess(update.key, update.request))
+    evt
+  }
+
+  "StaleRegionDetection when disabled" must {
+
+    "not schedule stale region check timer" in {
+      val strategy = new TestAllocationStrategy
+      val replicatorProbe = TestProbe()
+      val coordinator = system.actorOf(
+        ShardCoordinator.props(
+          "DisabledEntity",
+          ClusterShardingSettings(system),
+          strategy,
+          replicatorProbe.ref,
+          majorityMinCap = 0,
+          rememberEntitiesStoreProvider = None))
+
+      try {
+        replicatorProbe.expectMsgType[Get[_]](5.seconds)
+        replicatorProbe.reply(NotFound(LWWRegisterKey[State]("DisabledEntityCoordinatorState"), None))
+        replicatorProbe.expectNoMessage(100.millis)
+
+        val region = TestProbe()
+        coordinator.tell(Register(region.ref), region.ref)
+        completeNextUpdate(replicatorProbe)
+        region.expectMsgType[RegisterAck](5.seconds)
+
+        strategy.targetRegion = Some(region.ref)
+        val probe = TestProbe()
+        coordinator.tell(GetShardHome("s1"), probe.ref)
+        completeNextUpdate(replicatorProbe)
+        probe.expectMsgType[ShardHome](5.seconds)
+
+        // With detection disabled, no StaleRegionCheckTick timer should fire.
+        // Wait longer than the check-interval (500ms) to verify no timer was scheduled.
+        // Region stays alive — we only verify no timer-based activity occurs.
+        replicatorProbe.expectNoMessage(2.seconds)
+      } finally system.stop(coordinator)
+    }
+  }
+}