|
15 | 15 | import Distributed |
16 | 16 | import Logging |
17 | 17 |
|
| 18 | +/// Implements ``LifecycleWatch`` semantics in presence of ``Node`` failures. |
| 19 | +/// |
| 20 | +/// Depends on a failure detector (e.g. SWIM) to actually detect a node failure, however once detected, |
| 21 | +/// it handles notifying all _local_ actors which have watched at least one actor on the terminating node. |
| 22 | +/// |
| 23 | +/// ### Implementation |
| 24 | +/// In order to avoid every actor having to subscribe to cluster events and individually handle the relationship between those |
| 25 | +/// and individually watched actors, the watcher handles subscribing for cluster events on behalf of actors which watch |
| 26 | +/// other actors on remote nodes, and messages them upon a node becoming down. |
| 27 | +/// |
| 28 | +/// An actor which is notified automatically when a remote actor is `context.watch()`-ed. |
| 29 | +/// |
| 30 | +/// Allows manually mocking membership changes to trigger terminated notifications. |
internal actor DistributedNodeDeathWatcher {
    // TODO(distributed): actually use this actor rather than the behavior

    typealias ActorSystem = ClusterSystem

    private let log: Logger

    /// The node this watcher itself runs on.
    private let selfNode: UniqueNode

    /// Locally tracked view of cluster membership, folded from incoming cluster events.
    private var membership: Cluster.Membership = .empty

    /// Members which have been `removed`
    // TODO: clear after a few days, or some max count of nodes, use sorted set for this
    private var nodeTombstones: Set<UniqueNode> = []

    /// Mapping between remote node, and actors which have watched some actors on given remote node.
    private var remoteWatchCallbacks: [UniqueNode: Set<WatcherAndCallback>] = [:]

    /// Drains the cluster event stream for the lifetime of this watcher.
    /// NOTE: this task strongly retains `self`; `cancel()` must be called to
    /// break the cycle when the watcher is being torn down.
    private var eventListenerTask: Task<Void, Error>?

    init(actorSystem: ActorSystem) async {
        self.log = actorSystem.log
        self.selfNode = actorSystem.cluster.uniqueNode

        // Subscribe to cluster events on behalf of all local watchers, so each
        // watching actor does not need its own subscription.
        let events = actorSystem.cluster.events
        self.eventListenerTask = Task {
            for try await event in events {
                switch event {
                case .membershipChange(let change):
                    self.membershipChanged(change)
                case .snapshot(let membership):
                    // Replay the snapshot as a series of membership changes against the
                    // empty membership, so snapshots and incremental updates share one path.
                    let diff = Cluster.Membership._diff(from: .empty, to: membership)
                    for change in diff.changes {
                        self.membershipChanged(change)
                    }
                case .leadershipChange, .reachabilityChange:
                    break // ignore those, they don't affect downing
                }
            }
        }
    }

    /// Registers `watcher` to be notified via `nodeTerminatedFn` once `remoteNode` terminates.
    ///
    /// If the node already has a tombstone (i.e. it terminated before this watch was
    /// processed), the termination callback is fired immediately instead of being stored.
    func watchActor(
        on remoteNode: UniqueNode,
        by watcher: ClusterSystem.ActorID,
        whenTerminated nodeTerminatedFn: @escaping @Sendable (UniqueNode) async -> Void
    ) {
        guard !self.nodeTombstones.contains(remoteNode) else {
            // the system the watcher is attempting to watch has terminated before the watch has been processed,
            // thus we have to immediately reply with a termination system message, as otherwise it would never receive one
            Task {
                await nodeTerminatedFn(remoteNode)
            }
            return
        }

        let record = WatcherAndCallback(watcherID: watcher, callback: nodeTerminatedFn)
        self.remoteWatchCallbacks[remoteNode, default: []].insert(record)
    }

    /// Removes all watch registrations made by the actor with the given `id`,
    /// e.g. because that watcher itself has terminated.
    func removeWatcher(id: ClusterSystem.ActorID) {
        // TODO: this can be optimized a bit more I suppose, with a reverse lookup table
        // Equality of WatcherAndCallback is based solely on the watcher ID,
        // so a placeholder callback suffices to locate and remove the record.
        let removeMe = WatcherAndCallback(watcherID: id, callback: { _ in () })
        for (node, var watcherAndCallbacks) in self.remoteWatchCallbacks {
            if watcherAndCallbacks.remove(removeMe) != nil {
                self.remoteWatchCallbacks[node] = watcherAndCallbacks
            }
        }
    }

    /// Drops the tombstone kept for `node`, allowing its memory to be reclaimed.
    func cleanupTombstone(node: UniqueNode) {
        _ = self.nodeTombstones.remove(node)
    }

    /// Folds a membership change into the local membership view and, if the member
    /// has reached `.down` (or beyond), triggers termination notifications.
    func membershipChanged(_ change: Cluster.MembershipChange) {
        guard let change = self.membership.applyMembershipChange(change) else {
            return // no change, nothing to act on
        }

        // TODO: make sure we only handle ONCE?
        if change.status >= .down {
            // at least .down (i.e. down or removed);
            // on any of those we want to ensure we handle the "down"
            self.handleAddressDown(change)
        }
    }

    /// Notifies all watchers of actors on the terminated node, and records a tombstone
    /// so that late-arriving watches are answered with a termination immediately.
    func handleAddressDown(_ change: Cluster.MembershipChange) {
        let terminatedNode = change.node

        if let watchers = self.remoteWatchCallbacks.removeValue(forKey: terminatedNode) {
            for watcher in watchers {
                Task {
                    await watcher.callback(terminatedNode)
                }
            }
        }

        // we need to keep a tombstone, so we can immediately reply with a terminated,
        // in case another watch was just in progress of being made
        self.nodeTombstones.insert(terminatedNode)
    }

    /// Stops consuming cluster events; releases the strong self-reference held by the listener task.
    func cancel() {
        self.eventListenerTask?.cancel()
        self.eventListenerTask = nil
    }
}
| 140 | + |
extension DistributedNodeDeathWatcher {
    /// Pairs the identity of a watching actor with the callback to invoke once a
    /// watched node terminates.
    ///
    /// Hashing and equality are intentionally based only on `watcherID`, so a record
    /// carrying any placeholder callback can be used to locate/remove a watcher in a `Set`.
    struct WatcherAndCallback: Hashable {
        /// Address of the local watcher which had issued this watch
        let watcherID: ClusterSystem.ActorID
        /// Invoked with the terminated node once it is declared down.
        let callback: @Sendable (UniqueNode) async -> Void

        func hash(into hasher: inout Hasher) {
            self.watcherID.hash(into: &hasher)
        }

        static func == (lhs: Self, rhs: Self) -> Bool {
            lhs.watcherID == rhs.watcherID
        }
    }
}
0 commit comments