Skip to content

Commit fa8f803

Browse files
authored
feat: add periodic stale-region-detection for ShardCoordinator (#32892)
* feat: add stale-region-detection safety net for ShardCoordinator * switch to a re-watch mechanism * docs: add paragraph for Stale region detection * Revert "docs: add paragraph for Stale region detection" This reverts commit b5b4564. * fix: use duration converters and use timer with fixed delay * add warning log for found stale regions
1 parent f84d844 commit fa8f803

File tree

3 files changed

+280
-1
lines changed

3 files changed

+280
-1
lines changed

akka-cluster-sharding/src/main/resources/reference.conf

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,27 @@ akka.cluster.sharding {
397397
# The tradeoff of increasing this is that coordinator startup will be slower.
398398
read-majority-plus = 5
399399
}
400-
400+
401+
# Periodic detection of stale regions by re-watching remote regions.
402+
# If a Terminated message from DeathWatch was lost or dropped, the coordinator
403+
# still believes shards live on a node that has left the cluster.
404+
# When enabled, the coordinator periodically unwatches and re-watches all
405+
# remote regions. If the node is gone, ClusterRemoteWatcher will immediately
406+
# deliver a new Terminated via its memberTombstones check. If the node is
407+
# alive, the watch is simply re-established (no-op).
408+
stale-region-detection {
409+
# Enable periodic stale region detection. Off by default.
410+
enabled = off
411+
412+
# How often to re-watch remote regions.
413+
check-interval = 30s
414+
415+
# Grace period after coordinator startup before starting checks.
416+
# During startup, watchStateActors() already handles stale regions,
417+
# and the cluster may still be converging.
418+
startup-grace-period = 300s
419+
}
420+
401421
# Settings for the Distributed Data replicator.
402422
# Same layout as akka.cluster.distributed-data.
403423
# The "role" of the distributed-data configuration is not used. The distributed-data

akka-cluster-sharding/src/main/scala/akka/cluster/sharding/ShardCoordinator.scala

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import scala.collection.immutable
1212
import scala.concurrent.ExecutionContext
1313
import scala.concurrent.Future
1414
import scala.concurrent.duration._
15+
import scala.jdk.DurationConverters._
1516
import scala.util.Success
1617

1718
import akka.actor._
@@ -604,6 +605,8 @@ object ShardCoordinator {
604605

605606
private final case class DelayedShardRegionTerminated(region: ActorRef)
606607

608+
private case object StaleRegionCheckTick
609+
607610
private final case class StopShardTimeout(requestId: UUID)
608611

609612
/**
@@ -823,6 +826,14 @@ abstract class ShardCoordinator(
823826
// each waiting actor together with a request identifier to clear out all waiting for one request on timeout
824827
var waitingForShardsToStop: Map[ShardId, Set[(ActorRef, UUID)]] = Map.empty
825828

829+
private val staleRegionDetectionConfig = {
830+
val c = context.system.settings.config.getConfig("akka.cluster.sharding.stale-region-detection")
831+
val enabled = c.getBoolean("enabled")
832+
val checkInterval = c.getDuration("check-interval").toScala
833+
val startupGracePeriod = c.getDuration("startup-grace-period").toScala
834+
(enabled, checkInterval, startupGracePeriod)
835+
}
836+
826837
import context.dispatcher
827838

828839
cluster.subscribe(
@@ -1141,6 +1152,19 @@ abstract class ShardCoordinator(
11411152
}
11421153
}
11431154

1155+
case StaleRegionCheckTick =>
1156+
// Re-watch all remote regions.
1157+
// If the node is gone, ClusterRemoteWatcher.addWatch() will immediately
1158+
// trigger a new Terminated via the memberTombstones check.
1159+
// If the node is alive, the watch is simply re-established (no-op).
1160+
state.regions.keys.foreach { ref =>
1161+
if (!ref.path.address.hasLocalScope) {
1162+
log.warning("{}: Stale region detection, re-watching region [{}]", typeName, ref)
1163+
context.unwatch(ref)
1164+
context.watch(ref)
1165+
}
1166+
}
1167+
11441168
case ShardCoordinator.Internal.Terminate =>
11451169
terminate()
11461170
}: Receive).orElse[Any, Unit](receiveTerminated)
@@ -1319,6 +1343,12 @@ abstract class ShardCoordinator(
13191343
// This is an optimization that makes it operational faster and reduces the
13201344
// amount of lost messages during startup.
13211345
context.system.scheduler.scheduleOnce(500.millis, self, StateInitialized)
1346+
1347+
// Start stale region detection after the startup grace period
1348+
val (srdEnabled, srdCheckInterval, srdStartupGrace) = staleRegionDetectionConfig
1349+
if (srdEnabled) {
1350+
timers.startTimerWithFixedDelay(StaleRegionCheckTick, StaleRegionCheckTick, srdStartupGrace, srdCheckInterval)
1351+
}
13221352
}
13231353

13241354
def stateInitialized(): Unit = {
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
/*
2+
* Copyright (C) 2009-2025 Lightbend Inc. <https://www.lightbend.com>
3+
*/
4+
5+
package akka.cluster.sharding
6+
7+
import scala.concurrent.Future
8+
import scala.concurrent.duration._
9+
10+
import com.typesafe.config.ConfigFactory
11+
12+
import akka.actor._
13+
import akka.cluster.Cluster
14+
import akka.cluster.MemberStatus
15+
import akka.cluster.ddata.LWWRegister
16+
import akka.cluster.ddata.LWWRegisterKey
17+
import akka.cluster.ddata.Replicator._
18+
import akka.cluster.sharding.ShardCoordinator.Internal._
19+
import akka.cluster.sharding.ShardRegion.ShardId
20+
import akka.testkit._
21+
22+
object StaleRegionDetectionSpec {
23+
24+
def config(enabled: Boolean = true) = ConfigFactory.parseString(s"""
25+
akka.actor.provider = cluster
26+
akka.remote.artery.canonical.port = 0
27+
akka.loglevel = DEBUG
28+
akka.loggers = ["akka.testkit.SilenceAllTestEventListener"]
29+
akka.cluster.sharding.verbose-debug-logging = on
30+
akka.cluster.sharding.updating-state-timeout = 60s
31+
akka.cluster.sharding.rebalance-interval = 120s
32+
akka.cluster.sharding.shard-start-timeout = 120s
33+
akka.cluster.min-nr-of-members = 1
34+
akka.cluster.sharding.stale-region-detection {
35+
enabled = $enabled
36+
check-interval = 500ms
37+
startup-grace-period = 0s
38+
}
39+
""")
40+
41+
class TestAllocationStrategy extends ShardCoordinator.ShardAllocationStrategy {
42+
@volatile var targetRegion: Option[ActorRef] = None
43+
44+
override def allocateShard(
45+
requester: ActorRef,
46+
shardId: ShardId,
47+
currentShardAllocations: Map[ActorRef, IndexedSeq[ShardId]]): Future[ActorRef] =
48+
Future.successful(targetRegion.getOrElse(currentShardAllocations.minBy(_._2.size)._1))
49+
50+
override def rebalance(
51+
currentShardAllocations: Map[ActorRef, IndexedSeq[ShardId]],
52+
rebalanceInProgress: Set[ShardId]): Future[Set[ShardId]] =
53+
Future.successful(Set.empty)
54+
}
55+
}
56+
57+
class StaleRegionDetectionSpec extends AkkaSpec(StaleRegionDetectionSpec.config()) with WithLogCapturing {
58+
59+
import StaleRegionDetectionSpec._
60+
61+
private type CoordinatorUpdate = Update[LWWRegister[State]]
62+
63+
private case class Fixture(coordinator: ActorRef, replicatorProbe: TestProbe, strategy: TestAllocationStrategy)
64+
65+
private var testCounter = 0
66+
67+
private def nextTypeName(): String = {
68+
testCounter += 1
69+
s"TestEntity$testCounter"
70+
}
71+
72+
override def atStartup(): Unit = {
73+
val cluster = Cluster(system)
74+
cluster.join(cluster.selfAddress)
75+
awaitAssert {
76+
cluster.readView.members.count(_.status == MemberStatus.Up) should ===(1)
77+
}
78+
}
79+
80+
private def createFixture(): Fixture = {
81+
val typeName = nextTypeName()
82+
val strategy = new TestAllocationStrategy
83+
val replicatorProbe = TestProbe()
84+
val coordinator = system.actorOf(
85+
ShardCoordinator.props(
86+
typeName,
87+
ClusterShardingSettings(system),
88+
strategy,
89+
replicatorProbe.ref,
90+
majorityMinCap = 0,
91+
rememberEntitiesStoreProvider = None))
92+
93+
// Bootstrap: respond to the initial ddata Get with empty state
94+
replicatorProbe.expectMsgType[Get[_]](5.seconds)
95+
replicatorProbe.reply(NotFound(LWWRegisterKey[State](s"${typeName}CoordinatorState"), None))
96+
replicatorProbe.expectNoMessage(100.millis)
97+
98+
Fixture(coordinator, replicatorProbe, strategy)
99+
}
100+
101+
private def completeNextUpdate(replicatorProbe: TestProbe): DomainEvent = {
102+
val update = replicatorProbe.expectMsgType[CoordinatorUpdate](5.seconds)
103+
val evt = update.request.get.asInstanceOf[DomainEvent]
104+
replicatorProbe.reply(UpdateSuccess(update.key, update.request))
105+
evt
106+
}
107+
108+
private def registerRegion(coordinator: ActorRef, replicatorProbe: TestProbe): TestProbe = {
109+
val region = TestProbe()
110+
coordinator.tell(Register(region.ref), region.ref)
111+
completeNextUpdate(replicatorProbe)
112+
region.expectMsgType[RegisterAck](5.seconds)
113+
region
114+
}
115+
116+
private def allocateShard(coordinator: ActorRef, shardId: String, replicatorProbe: TestProbe): ShardHome = {
117+
val probe = TestProbe()
118+
coordinator.tell(GetShardHome(shardId), probe.ref)
119+
completeNextUpdate(replicatorProbe)
120+
probe.expectMsgType[ShardHome](5.seconds)
121+
}
122+
123+
"StaleRegionDetection" must {
124+
125+
"detect and clean up a stale region via unwatch/rewatch" in {
126+
val f = createFixture()
127+
val Fixture(coordinator, replicatorProbe, strategy) = f
128+
129+
try {
130+
val regionA = registerRegion(coordinator, replicatorProbe)
131+
132+
strategy.targetRegion = Some(regionA.ref)
133+
allocateShard(coordinator, "s1", replicatorProbe)
134+
135+
// Stop regionA without DeathWatch delivering Terminated to coordinator
136+
// (simulating the bug where Terminated is lost)
137+
watch(regionA.ref)
138+
system.stop(regionA.ref)
139+
expectTerminated(regionA.ref, 5.seconds)
140+
141+
// Wait for stale region detection to kick in
142+
// startup-grace-period = 0s, check-interval = 500ms
143+
// The unwatch/rewatch cycle should trigger a new Terminated quickly
144+
val terminated = replicatorProbe.expectMsgType[CoordinatorUpdate](5.seconds)
145+
val evt = terminated.request.get.asInstanceOf[DomainEvent]
146+
evt shouldBe a[ShardRegionTerminated]
147+
evt.asInstanceOf[ShardRegionTerminated].region should ===(regionA.ref)
148+
} finally system.stop(coordinator)
149+
}
150+
151+
"not act on regions whose member is still in the cluster" in {
152+
val f = createFixture()
153+
val Fixture(coordinator, replicatorProbe, strategy) = f
154+
155+
try {
156+
val regionA = registerRegion(coordinator, replicatorProbe)
157+
158+
strategy.targetRegion = Some(regionA.ref)
159+
allocateShard(coordinator, "s1", replicatorProbe)
160+
161+
// regionA is local (same node), so its address hasLocalScope — never re-watched
162+
// Wait long enough for multiple check cycles
163+
replicatorProbe.expectNoMessage(3.seconds)
164+
} finally system.stop(coordinator)
165+
}
166+
}
167+
}
168+
169+
class StaleRegionDetectionDisabledSpec
170+
extends AkkaSpec(StaleRegionDetectionSpec.config(enabled = false))
171+
with WithLogCapturing {
172+
173+
import StaleRegionDetectionSpec._
174+
175+
private type CoordinatorUpdate = Update[LWWRegister[State]]
176+
177+
override def atStartup(): Unit = {
178+
val cluster = Cluster(system)
179+
cluster.join(cluster.selfAddress)
180+
awaitAssert {
181+
cluster.readView.members.count(_.status == MemberStatus.Up) should ===(1)
182+
}
183+
}
184+
185+
private def completeNextUpdate(replicatorProbe: TestProbe): DomainEvent = {
186+
val update = replicatorProbe.expectMsgType[CoordinatorUpdate](5.seconds)
187+
val evt = update.request.get.asInstanceOf[DomainEvent]
188+
replicatorProbe.reply(UpdateSuccess(update.key, update.request))
189+
evt
190+
}
191+
192+
"StaleRegionDetection when disabled" must {
193+
194+
"not schedule stale region check timer" in {
195+
val strategy = new TestAllocationStrategy
196+
val replicatorProbe = TestProbe()
197+
val coordinator = system.actorOf(
198+
ShardCoordinator.props(
199+
"DisabledEntity",
200+
ClusterShardingSettings(system),
201+
strategy,
202+
replicatorProbe.ref,
203+
majorityMinCap = 0,
204+
rememberEntitiesStoreProvider = None))
205+
206+
try {
207+
replicatorProbe.expectMsgType[Get[_]](5.seconds)
208+
replicatorProbe.reply(NotFound(LWWRegisterKey[State]("DisabledEntityCoordinatorState"), None))
209+
replicatorProbe.expectNoMessage(100.millis)
210+
211+
val region = TestProbe()
212+
coordinator.tell(Register(region.ref), region.ref)
213+
completeNextUpdate(replicatorProbe)
214+
region.expectMsgType[RegisterAck](5.seconds)
215+
216+
strategy.targetRegion = Some(region.ref)
217+
val probe = TestProbe()
218+
coordinator.tell(GetShardHome("s1"), probe.ref)
219+
completeNextUpdate(replicatorProbe)
220+
probe.expectMsgType[ShardHome](5.seconds)
221+
222+
// With detection disabled, no StaleRegionCheckTick timer should fire.
223+
// Wait longer than the check-interval (500ms) to verify no timer was scheduled.
224+
// Region stays alive — we only verify no timer-based activity occurs.
225+
replicatorProbe.expectNoMessage(2.seconds)
226+
} finally system.stop(coordinator)
227+
}
228+
}
229+
}

0 commit comments

Comments
 (0)