|
15 | 15 | import Distributed |
16 | 16 | import Logging |
17 | 17 |
|
| 18 | +/// Implements ``LifecycleWatch`` semantics in presence of ``Node`` failures. |
| 19 | +/// |
| 20 | +/// Depends on a failure detector (e.g. SWIM) to actually detect a node failure, however once detected, |
| 21 | +/// it handles notifying all _local_ actors which have watched at least one actor on the terminating node. |
| 22 | +/// |
| 23 | +/// ### Implementation |
| 24 | +/// In order to avoid every actor having to subscribe to cluster events and individually handle the relationship between those |
| 25 | +/// and individually watched actors, the watcher handles subscribing for cluster events on behalf of actors which watch |
| 26 | +/// other actors on remote nodes, and messages them upon a node becoming down. |
| 27 | +/// |
| 28 | +/// An actor which is notified automatically when a remote actor is `context.watch()`-ed. |
| 29 | +/// |
| 30 | +/// Allows manually mocking membership changes to trigger terminated notifications. |
internal actor DistributedNodeDeathWatcher {
    // TODO(distributed): actually use this actor rather than the behavior

    typealias ActorSystem = ClusterSystem

    private let log: Logger

    /// The node this watcher itself runs on.
    private let selfNode: UniqueNode

    /// Locally tracked view of cluster membership, folded from incoming cluster events.
    private var membership: Cluster.Membership = .empty

    /// Members which have been `removed`
    // TODO: clear after a few days, or some max count of nodes, use sorted set for this
    private var nodeTombstones: Set<UniqueNode> = []

    /// Mapping between remote node, and actors which have watched some actors on given remote node.
    private var remoteWatchCallbacks: [UniqueNode: Set<WatcherAndCallback>] = [:]

    /// Drains the cluster event stream for the lifetime of this watcher.
    /// NOTE: this task strongly retains `self`; `cancel()` must be called to
    /// break the cycle when the watcher is being torn down.
    private var eventListenerTask: Task<Void, Error>?

    init(actorSystem: ActorSystem) async {
        self.log = actorSystem.log
        self.selfNode = actorSystem.cluster.uniqueNode

        // Subscribe to cluster events on behalf of all local watchers, so each
        // watching actor does not need its own subscription.
        let events = actorSystem.cluster.events
        self.eventListenerTask = Task {
            for try await event in events {
                switch event {
                case .membershipChange(let change):
                    self.membershipChanged(change)
                case .snapshot(let membership):
                    // Replay the snapshot as a series of membership changes against the
                    // empty membership, so snapshots and incremental updates share one path.
                    let diff = Cluster.Membership._diff(from: .empty, to: membership)
                    for change in diff.changes {
                        self.membershipChanged(change)
                    }
                case .leadershipChange, .reachabilityChange:
                    break // ignore those, they don't affect downing
                }
            }
        }
    }

    /// Registers `watcher` to be notified via `nodeTerminatedFn` once `remoteNode` terminates.
    ///
    /// If the node already has a tombstone (i.e. it terminated before this watch was
    /// processed), the termination callback is fired immediately instead of being stored.
    func watchActor(
        on remoteNode: UniqueNode,
        by watcher: ClusterSystem.ActorID,
        whenTerminated nodeTerminatedFn: @escaping @Sendable (UniqueNode) async -> Void
    ) {
        guard !self.nodeTombstones.contains(remoteNode) else {
            // the system the watcher is attempting to watch has terminated before the watch has been processed,
            // thus we have to immediately reply with a termination system message, as otherwise it would never receive one
            Task {
                await nodeTerminatedFn(remoteNode)
            }
            return
        }

        let record = WatcherAndCallback(watcherID: watcher, callback: nodeTerminatedFn)
        self.remoteWatchCallbacks[remoteNode, default: []].insert(record)
    }

    /// Removes all watch registrations made by the actor with the given `id`,
    /// e.g. because that watcher itself has terminated.
    func removeWatcher(id: ClusterSystem.ActorID) {
        // TODO: this can be optimized a bit more I suppose, with a reverse lookup table
        // Equality of WatcherAndCallback is based solely on the watcher ID,
        // so a placeholder callback suffices to locate and remove the record.
        let removeMe = WatcherAndCallback(watcherID: id, callback: { _ in () })
        for (node, var watcherAndCallbacks) in self.remoteWatchCallbacks {
            if watcherAndCallbacks.remove(removeMe) != nil {
                self.remoteWatchCallbacks[node] = watcherAndCallbacks
            }
        }
    }

    /// Drops the tombstone kept for `node`, allowing its memory to be reclaimed.
    func cleanupTombstone(node: UniqueNode) {
        _ = self.nodeTombstones.remove(node)
    }

    /// Folds a membership change into the local membership view and, if the member
    /// has reached `.down` (or beyond), triggers termination notifications.
    func membershipChanged(_ change: Cluster.MembershipChange) {
        guard let change = self.membership.applyMembershipChange(change) else {
            return // no change, nothing to act on
        }

        // TODO: make sure we only handle ONCE?
        if change.status >= .down {
            // at least .down (i.e. down or removed);
            // on any of those we want to ensure we handle the "down"
            self.handleAddressDown(change)
        }
    }

    /// Notifies all watchers of actors on the terminated node, and records a tombstone
    /// so that late-arriving watches are answered with a termination immediately.
    func handleAddressDown(_ change: Cluster.MembershipChange) {
        let terminatedNode = change.node

        if let watchers = self.remoteWatchCallbacks.removeValue(forKey: terminatedNode) {
            for watcher in watchers {
                Task {
                    await watcher.callback(terminatedNode)
                }
            }
        }

        // we need to keep a tombstone, so we can immediately reply with a terminated,
        // in case another watch was just in progress of being made
        self.nodeTombstones.insert(terminatedNode)
    }

    /// Stops consuming cluster events; releases the strong self-reference held by the listener task.
    func cancel() {
        self.eventListenerTask?.cancel()
        self.eventListenerTask = nil
    }
}
| 140 | + |
extension DistributedNodeDeathWatcher {
    /// Pairs the identity of a watching actor with the callback to invoke once a
    /// watched node terminates.
    ///
    /// Hashing and equality are intentionally based only on `watcherID`, so a record
    /// carrying any placeholder callback can be used to locate/remove a watcher in a `Set`.
    struct WatcherAndCallback: Hashable {
        /// Address of the local watcher which had issued this watch
        let watcherID: ClusterSystem.ActorID
        /// Invoked with the terminated node once it is declared down.
        let callback: @Sendable (UniqueNode) async -> Void

        func hash(into hasher: inout Hasher) {
            self.watcherID.hash(into: &hasher)
        }

        static func == (lhs: Self, rhs: Self) -> Bool {
            lhs.watcherID == rhs.watcherID
        }
    }
}
0 commit comments