diff --git a/.cargo/config.toml b/.cargo/config.toml index 4d1a8e5e1..17cfb5135 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,6 +1,6 @@ [env] # This temporarily overrides the version of the CLI used for integration tests, locally and in CI -CLI_VERSION_OVERRIDE = "v1.4.1-cloud-v1-29-0-139-2.0" +#CLI_VERSION_OVERRIDE = "v1.4.1-cloud-v1-29-0-139-2.0" [alias] # Not sure why --all-features doesn't work diff --git a/client/src/lib.rs b/client/src/lib.rs index 8bdde001d..4d633aa3c 100644 --- a/client/src/lib.rs +++ b/client/src/lib.rs @@ -34,7 +34,9 @@ pub use temporal_sdk_core_protos::temporal::api::{ }, }; pub use tonic; -pub use worker_registry::{Slot, SlotManager, SlotProvider, WorkerKey}; +pub use worker_registry::{ + ClientWorker, ClientWorkerSet, HeartbeatCallback, SharedNamespaceWorkerTrait, Slot, +}; pub use workflow_handle::{ GetWorkflowResultOpts, WorkflowExecutionInfo, WorkflowExecutionResult, WorkflowHandle, }; @@ -390,7 +392,7 @@ pub struct ConfiguredClient { headers: Arc>, /// Capabilities as read from the `get_system_info` RPC call made on client connection capabilities: Option, - workers: Arc, + workers: Arc, } impl ConfiguredClient { @@ -440,9 +442,14 @@ impl ConfiguredClient { } /// Returns a cloned reference to a registry with workers using this client instance - pub fn workers(&self) -> Arc { + pub fn workers(&self) -> Arc { self.workers.clone() } + + /// Returns the worker grouping key, this should be unique across each client + pub fn worker_grouping_key(&self) -> Uuid { + self.workers.worker_grouping_key() + } } #[derive(Debug)] @@ -584,7 +591,7 @@ impl ClientOptions { client: TemporalServiceClient::new(svc), options: Arc::new(self.clone()), capabilities: None, - workers: Arc::new(SlotManager::new()), + workers: Arc::new(ClientWorkerSet::new()), }; if !self.skip_get_system_info { match client @@ -866,6 +873,11 @@ impl Client { pub fn into_inner(self) -> ConfiguredClient { self.inner } + + /// Returns the client-wide key + pub fn worker_grouping_key(&self) -> Uuid { + self.inner.worker_grouping_key() + } } impl NamespacedClient for Client { diff --git a/client/src/raw.rs b/client/src/raw.rs index c9fec24f1..655ad3b56 100644 --- a/client/src/raw.rs +++ b/client/src/raw.rs @@ -6,7 +6,7 @@ use crate::{ Client, ConfiguredClient, LONG_POLL_TIMEOUT, RequestExt, RetryClient, SharedReplaceableClient, TEMPORAL_NAMESPACE_HEADER_KEY, TemporalServiceClient, metrics::namespace_kv, - worker_registry::{Slot, SlotManager}, + worker_registry::{ClientWorkerSet, Slot}, }; use dyn_clone::DynClone; use futures_util::{FutureExt, TryFutureExt, future::BoxFuture}; @@ -33,7 +33,7 @@ use tonic::{ trait RawClientProducer { /// Returns information about workers associated with this client. Implementers outside of /// core can safely return `None`. 
- fn get_workers_info(&self) -> Option>; + fn get_workers_info(&self) -> Option>; /// Return a workflow service client instance fn workflow_client(&mut self) -> Box; @@ -175,7 +175,7 @@ impl RawClientProducer for RetryClient where RC: RawClientProducer + 'static, { - fn get_workers_info(&self) -> Option> { + fn get_workers_info(&self) -> Option> { self.get_client().get_workers_info() } @@ -253,7 +253,7 @@ impl RawClientProducer for SharedReplaceableClient where RC: RawClientProducer + Clone + Send + Sync + 'static, { - fn get_workers_info(&self) -> Option> { + fn get_workers_info(&self) -> Option> { self.inner_cow().get_workers_info() } fn workflow_client(&mut self) -> Box { @@ -284,7 +284,7 @@ impl RawGrpcCaller for SharedReplaceableClient where } impl RawClientProducer for TemporalServiceClient { - fn get_workers_info(&self) -> Option> { + fn get_workers_info(&self) -> Option> { None } @@ -312,7 +312,7 @@ impl RawClientProducer for TemporalServiceClient { impl RawGrpcCaller for TemporalServiceClient {} impl RawClientProducer for ConfiguredClient { - fn get_workers_info(&self) -> Option> { + fn get_workers_info(&self) -> Option> { Some(self.workers()) } @@ -340,7 +340,7 @@ impl RawClientProducer for ConfiguredClient { impl RawGrpcCaller for ConfiguredClient {} impl RawClientProducer for Client { - fn get_workers_info(&self) -> Option> { + fn get_workers_info(&self) -> Option> { self.inner.get_workers_info() } @@ -491,7 +491,7 @@ macro_rules! proxy_impl { mut request: tonic::Request<$req>, ) -> BoxFuture<'_, Result, tonic::Status>> { type_closure_arg(&mut request, $closure_request); - let data = type_closure_two_arg(&mut request, Option::>::None, + let data = type_closure_two_arg(&mut request, Option::>::None, $closure_before); async move { type_closure_two_arg(<$client_type<_>>::$method(self, request).await, @@ -1601,6 +1601,7 @@ mod tests { operatorservice::v1::DeleteNamespaceRequest, workflowservice::v1::ListNamespacesRequest, }; use tonic::IntoRequest; + use uuid::Uuid; // Just to help make sure some stuff compiles. Not run. 
#[allow(dead_code)] @@ -1791,7 +1792,7 @@ mod tests { #[case::without_versioning(false)] #[tokio::test] async fn eager_reservations_attach_deployment_options(#[case] use_worker_versioning: bool) { - use crate::worker_registry::{MockSlot, MockSlotProvider}; + use crate::worker_registry::{MockClientWorker, MockSlot}; use temporal_sdk_core_api::worker::{WorkerDeploymentOptions, WorkerDeploymentVersion}; use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerVersioningMode; @@ -1803,13 +1804,13 @@ mod tests { #[derive(Clone)] struct MyFakeServices { - slot_manager: Arc, + client_worker_set: Arc, expected_mode: WorkerVersioningMode, } impl RawGrpcCaller for MyFakeServices {} impl RawClientProducer for MyFakeServices { - fn get_workers_info(&self) -> Option> { - Some(self.slot_manager.clone()) + fn get_workers_info(&self) -> Option> { + Some(self.client_worker_set.clone()) } fn workflow_client(&mut self) -> Box { Box::new(MyFakeWfClient { @@ -1839,7 +1840,7 @@ mod tests { default_versioning_behavior: None, }; - let mut mock_provider = MockSlotProvider::new(); + let mut mock_provider = MockClientWorker::new(); mock_provider .expect_namespace() .return_const("test-namespace".to_string()); @@ -1854,9 +1855,16 @@ mod tests { mock_provider .expect_deployment_options() .return_const(Some(deployment_opts.clone())); + mock_provider.expect_heartbeat_enabled().return_const(false); + let uuid = Uuid::new_v4(); + mock_provider + .expect_worker_instance_key() + .return_const(uuid); - let slot_manager = Arc::new(SlotManager::new()); - slot_manager.register(Box::new(mock_provider)); + let client_worker_set = Arc::new(ClientWorkerSet::new()); + client_worker_set + .register_worker(Arc::new(mock_provider), true) + .unwrap(); #[derive(Clone)] struct MyFakeWfClient { @@ -1886,7 +1894,7 @@ mod tests { } let mut mfs = MyFakeServices { - slot_manager, + client_worker_set, expected_mode, }; diff --git a/client/src/worker_registry/mod.rs b/client/src/worker_registry/mod.rs index 5fbc78333..67b034c22 100644 --- a/client/src/worker_registry/mod.rs +++ b/client/src/worker_registry/mod.rs @@ -2,29 +2,16 @@ //! This is needed to implement Eager Workflow Start, a latency optimization in which the client, //! after reserving a slot, directly forwards a WFT to a local worker. +use anyhow::bail; use parking_lot::RwLock; -use slotmap::SlotMap; -use std::collections::{HashMap, hash_map::Entry::Vacant}; - +use std::collections::{ + HashMap, + hash_map::Entry::{Occupied, Vacant}, +}; +use std::sync::Arc; +use temporal_sdk_core_protos::temporal::api::worker::v1::WorkerHeartbeat; use temporal_sdk_core_protos::temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse; - -slotmap::new_key_type! { - /// Registration key for a worker - pub struct WorkerKey; -} - -/// This trait is implemented by an object associated with a worker, which provides WFT processing slots. -#[cfg_attr(test, mockall::automock)] -pub trait SlotProvider: std::fmt::Debug { - /// The namespace for the WFTs that it can process. - fn namespace(&self) -> &str; - /// The task queue this provider listens to. - fn task_queue(&self) -> &str; - /// Try to reserve a slot on this worker. - fn try_reserve_wft_slot(&self) -> Option>; - /// Get the worker deployment options for this worker, if using deployment-based versioning. - fn deployment_options(&self) -> Option; -} +use uuid::Uuid; /// This trait represents a slot reserved for processing a WFT by a worker. 
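// --- Illustrative sketch (not part of this patch) -----------------------------------
// The module docs above describe Eager Workflow Start: the client looks up a local
// worker by namespace + task queue and tries to reserve a WFT slot before forwarding
// the first workflow task. Below is a std-only reduction of that lookup; `ToySlot`,
// `ToyWorker`, and `ToyRegistry` are hypothetical stand-ins for `Slot`, `ClientWorker`,
// and `ClientWorkerSet`, and the real registry also hands back deployment options.
use std::collections::HashMap;
use std::sync::atomic::{AtomicUsize, Ordering};

struct ToySlot;

struct ToyWorker {
    free_slots: AtomicUsize,
}

impl ToyWorker {
    fn try_reserve_wft_slot(&self) -> Option<ToySlot> {
        // Decrement the free-slot counter only if capacity remains.
        self.free_slots
            .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |n| n.checked_sub(1))
            .ok()
            .map(|_| ToySlot)
    }
}

#[derive(Default)]
struct ToyRegistry {
    // One slot-providing worker per (namespace, task_queue) pair.
    workers: HashMap<(String, String), ToyWorker>,
}

impl ToyRegistry {
    fn try_reserve_wft_slot(&self, namespace: &str, task_queue: &str) -> Option<ToySlot> {
        self.workers
            .get(&(namespace.to_string(), task_queue.to_string()))
            .and_then(|w| w.try_reserve_wft_slot())
    }
}

fn main() {
    let mut reg = ToyRegistry::default();
    reg.workers.insert(
        ("ns".into(), "q".into()),
        ToyWorker { free_slots: AtomicUsize::new(1) },
    );
    assert!(reg.try_reserve_wft_slot("ns", "q").is_some()); // slot reserved eagerly
    assert!(reg.try_reserve_wft_slot("ns", "q").is_none()); // capacity exhausted
    assert!(reg.try_reserve_wft_slot("ns", "other").is_none()); // no worker registered
}
// -------------------------------------------------------------------------------------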
#[cfg_attr(test, mockall::automock)] @@ -59,21 +46,23 @@ impl SlotKey { } } -/// This is an inner class for [SlotManager] needed to hide the mutex. -#[derive(Default, Debug)] -struct SlotManagerImpl { - /// Maps keys, i.e., namespace#task_queue, to provider. - providers: HashMap>, - /// Maps ids to keys in `providers`. - index: SlotMap, +/// This is an inner class for [ClientWorkerSet] needed to hide the mutex. +struct ClientWorkerSetImpl { + /// Maps slot keys to slot provider worker. + slot_providers: HashMap, + /// Maps worker_instance_key to registered workers + all_workers: HashMap>, + /// Maps namespace to shared worker for worker heartbeating + shared_worker: HashMap>, } -impl SlotManagerImpl { +impl ClientWorkerSetImpl { /// Factory method. fn new() -> Self { Self { - index: Default::default(), - providers: Default::default(), + slot_providers: Default::default(), + all_workers: Default::default(), + shared_worker: Default::default(), } } @@ -83,10 +72,11 @@ impl SlotManagerImpl { task_queue: String, ) -> Option { let key = SlotKey::new(namespace, task_queue); - if let Some(p) = self.providers.get(&key) - && let Some(slot) = p.try_reserve_wft_slot() + if let Some(p) = self.slot_providers.get(&key) + && let Some(worker) = self.all_workers.get(p) + && let Some(slot) = worker.try_reserve_wft_slot() { - let deployment_options = p.deployment_options(); + let deployment_options = worker.deployment_options(); return Some(SlotReservation { slot, deployment_options, @@ -95,47 +85,126 @@ impl SlotManagerImpl { None } - fn register(&mut self, provider: Box) -> Option { - let key = SlotKey::new( - provider.namespace().to_string(), - provider.task_queue().to_string(), + fn register( + &mut self, + worker: Arc, + skip_client_worker_set_check: bool, + ) -> Result<(), anyhow::Error> { + let slot_key = SlotKey::new( + worker.namespace().to_string(), + worker.task_queue().to_string(), ); - if let Vacant(p) = self.providers.entry(key.clone()) { - p.insert(provider); - Some(self.index.insert(key)) - } else { - warn!("Ignoring registration for worker: {key:?}."); - None + if self.slot_providers.contains_key(&slot_key) && !skip_client_worker_set_check { + bail!( + "Registration of multiple workers on the same namespace and task queue for the same client not allowed: {slot_key:?}, worker_instance_key: {:?}.", + worker.worker_instance_key() + ); } + + if worker.heartbeat_enabled() + && let Some(heartbeat_callback) = worker.heartbeat_callback() + { + let worker_instance_key = worker.worker_instance_key(); + let namespace = worker.namespace().to_string(); + + let shared_worker = match self.shared_worker.entry(namespace.clone()) { + Occupied(o) => o.into_mut(), + Vacant(v) => { + let shared_worker = worker.new_shared_namespace_worker()?; + v.insert(shared_worker) + } + }; + shared_worker.register_callback(worker_instance_key, heartbeat_callback); + } + + self.slot_providers + .insert(slot_key.clone(), worker.worker_instance_key()); + + self.all_workers + .insert(worker.worker_instance_key(), worker); + + Ok(()) } - fn unregister(&mut self, id: WorkerKey) -> Option> { - if let Some(key) = self.index.remove(id) { - self.providers.remove(&key) - } else { - None + fn unregister( + &mut self, + worker_instance_key: Uuid, + ) -> Result, anyhow::Error> { + let worker = self + .all_workers + .remove(&worker_instance_key) + .ok_or_else(|| { + anyhow::anyhow!("Worker with worker_instance_key {worker_instance_key} not found") + })?; + + let slot_key = SlotKey::new( + worker.namespace().to_string(), + 
worker.task_queue().to_string(), + ); + + self.slot_providers.remove(&slot_key); + + if let Some(w) = self.shared_worker.get_mut(worker.namespace()) { + let (callback, is_empty) = w.unregister_callback(worker.worker_instance_key()); + if callback.is_some() && is_empty { + self.shared_worker.remove(worker.namespace()); + } } + + Ok(worker) + } + + #[cfg(test)] + fn num_providers(&self) -> usize { + self.slot_providers.len() } #[cfg(test)] - fn num_providers(&self) -> (usize, usize) { - (self.index.len(), self.providers.len()) + fn num_heartbeat_workers(&self) -> usize { + self.shared_worker.values().map(|v| v.num_workers()).sum() } } +/// This trait represents a shared namespace worker that sends worker heartbeats and +/// receives worker commands. +pub trait SharedNamespaceWorkerTrait { + /// Namespace that the shared namespace worker is connected to. + fn namespace(&self) -> String; + + /// Registers a heartbeat callback. + fn register_callback(&self, worker_instance_key: Uuid, heartbeat_callback: HeartbeatCallback); + + /// Unregisters a heartbeat callback. Returns the callback removed, as well as a bool that + /// indicates if there are no remaining callbacks in the SharedNamespaceWorker, indicating + /// the shared worker itself can be shut down. + fn unregister_callback(&self, worker_instance_key: Uuid) -> (Option, bool); + + /// Returns the number of workers registered to this shared worker. + fn num_workers(&self) -> usize; +} + /// Enables local workers to make themselves visible to a shared client instance. -/// There can only be one worker registered per namespace+queue_name+client, others will get ignored. +/// +/// For slot managing, there can only be one worker registered per +/// namespace+queue_name+client, others will return an error. /// It also provides a convenient method to find compatible slots within the collection. -#[derive(Default, Debug)] -pub struct SlotManager { - manager: RwLock, +pub struct ClientWorkerSet { + worker_grouping_key: Uuid, + worker_manager: RwLock, } -impl SlotManager { +impl Default for ClientWorkerSet { + fn default() -> Self { + Self::new() + } +} + +impl ClientWorkerSet { /// Factory method. pub fn new() -> Self { Self { - manager: RwLock::new(SlotManagerImpl::new()), + worker_grouping_key: Uuid::new_v4(), + worker_manager: RwLock::new(ClientWorkerSetImpl::new()), } } @@ -146,29 +215,96 @@ impl SlotManager { namespace: String, task_queue: String, ) -> Option { - self.manager + self.worker_manager .read() .try_reserve_wft_slot(namespace, task_queue) } - /// Register a local worker that can provide WFT processing slots. - pub fn register(&self, provider: Box) -> Option { - self.manager.write().register(provider) + /// Register a local worker that can provide WFT processing slots and potentially worker heartbeating. + pub fn register_worker( + &self, + worker: Arc, + skip_client_worker_set_check: bool, + ) -> Result<(), anyhow::Error> { + self.worker_manager + .write() + .register(worker, skip_client_worker_set_check) + } + + /// Unregisters a local worker, typically when that worker starts shutdown. + pub fn unregister_worker( + &self, + worker_instance_key: Uuid, + ) -> Result, anyhow::Error> { + self.worker_manager.write().unregister(worker_instance_key) } - /// Unregister a provider, typically when its worker starts shutdown. - pub fn unregister(&self, id: WorkerKey) -> Option> { - self.manager.write().unregister(id) + /// Returns the worker grouping key, which is unique for each worker. 
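// --- Illustrative sketch (not part of this patch) -----------------------------------
// `SharedNamespaceWorkerTrait::unregister_callback` above returns the removed callback
// together with a flag saying whether any callbacks remain, which is what lets
// `ClientWorkerSetImpl::unregister` drop the per-namespace shared worker once the last
// heartbeating worker leaves. Std-only stand-ins here (`Heartbeat`, `Callback`,
// `CallbackBook`) are hypothetical; the real code keys callbacks by the worker instance
// `Uuid` and produces `WorkerHeartbeat` values.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

struct Heartbeat;

type Callback = Arc<dyn Fn() -> Heartbeat + Send + Sync>;

#[derive(Default)]
struct CallbackBook {
    callbacks: Mutex<HashMap<u64, Callback>>,
}

impl CallbackBook {
    fn register(&self, worker_key: u64, cb: Callback) {
        self.callbacks.lock().unwrap().insert(worker_key, cb);
    }

    // Mirrors `unregister_callback`: return the removed callback plus a flag telling the
    // caller whether this shared bookkeeping structure can now be torn down.
    fn unregister(&self, worker_key: u64) -> (Option<Callback>, bool) {
        let mut map = self.callbacks.lock().unwrap();
        let removed = map.remove(&worker_key);
        (removed, map.is_empty())
    }
}

fn main() {
    let book = CallbackBook::default();
    book.register(1, Arc::new(|| Heartbeat));
    book.register(2, Arc::new(|| Heartbeat));
    assert!(matches!(book.unregister(1), (Some(_), false)));
    // Removing the last callback reports `true`, signalling the shared worker can shut down.
    assert!(matches!(book.unregister(2), (Some(_), true)));
}
// -------------------------------------------------------------------------------------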
+ pub fn worker_grouping_key(&self) -> Uuid { + self.worker_grouping_key } #[cfg(test)] /// Returns (num_providers, num_buckets), where a bucket key is namespace+task_queue. /// There is only one provider per bucket so `num_providers` should be equal to `num_buckets`. - pub fn num_providers(&self) -> (usize, usize) { - self.manager.read().num_providers() + pub fn num_providers(&self) -> usize { + self.worker_manager.read().num_providers() + } + + #[cfg(test)] + /// Returns the total number of heartbeat workers registered across all namespaces. + pub fn num_heartbeat_workers(&self) -> usize { + self.worker_manager.read().num_heartbeat_workers() } } +impl std::fmt::Debug for ClientWorkerSet { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ClientWorkerSet") + .field("worker_grouping_key", &self.worker_grouping_key) + .finish() + } +} + +/// Contains a worker heartbeat callback, wrapped for mocking +pub type HeartbeatCallback = Arc WorkerHeartbeat + Send + Sync>; + +/// Represents a complete worker that can handle both slot management +/// and worker heartbeat functionality. +#[cfg_attr(test, mockall::automock)] +pub trait ClientWorker: Send + Sync { + /// The namespace this worker operates in + fn namespace(&self) -> &str; + + /// The task queue this worker listens to + fn task_queue(&self) -> &str; + + /// Try to reserve a slot for workflow task processing. + /// + /// This method should return `Some(slot)` if a workflow task slot is available, + /// or `None` if all slots are currently in use. The returned slot will be used + /// to process exactly one workflow task. + fn try_reserve_wft_slot(&self) -> Option>; + + /// Get the worker deployment options for this worker, if using deployment-based versioning. + fn deployment_options(&self) -> Option; + + /// Unique identifier for this worker instance. + /// This must be stable across the worker's lifetime and unique per instance. + fn worker_instance_key(&self) -> Uuid; + + /// Indicates if worker heartbeating is enabled for this client worker. + fn heartbeat_enabled(&self) -> bool; + + /// Returns the heartbeat callback that can be used to get WorkerHeartbeat data. 
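// Hypothetical sketch of constructing a `HeartbeatCallback` (the alias defined above):
// a closure that snapshots whatever in-memory state the worker tracks into a
// `WorkerHeartbeat` each time the shared namespace worker asks for one. Only
// `WorkerHeartbeat::default()` is assumed here, as the tests below also build heartbeats
// that way; a real callback would populate the message fields.
//
//     use std::sync::{
//         Arc,
//         atomic::{AtomicU64, Ordering},
//     };
//
//     fn make_heartbeat_callback(tasks_completed: Arc<AtomicU64>) -> HeartbeatCallback {
//         Arc::new(move || {
//             // Read live worker state...
//             let _completed = tasks_completed.load(Ordering::Relaxed);
//             // ...and fold it into the heartbeat message (field population elided).
//             WorkerHeartbeat::default()
//         })
//     }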
+ fn heartbeat_callback(&self) -> Option; + + /// Creates a new worker that implements the [SharedNamespaceWorkerTrait] + fn new_shared_namespace_worker( + &self, + ) -> Result, anyhow::Error>; +} + #[cfg(test)] mod tests { use super::*; @@ -190,8 +326,9 @@ mod tests { task_queue: String, with_error: bool, no_slots: bool, - ) -> MockSlotProvider { - let mut mock_provider = MockSlotProvider::new(); + heartbeat_enabled: bool, + ) -> MockClientWorker { + let mut mock_provider = MockClientWorker::new(); mock_provider .expect_try_reserve_wft_slot() .returning(move || { @@ -205,78 +342,316 @@ mod tests { mock_provider.expect_task_queue().return_const(task_queue); mock_provider.expect_deployment_options().return_const(None); mock_provider + .expect_heartbeat_enabled() + .return_const(heartbeat_enabled); + mock_provider + .expect_worker_instance_key() + .return_const(Uuid::new_v4()); + mock_provider } #[test] - fn registry_respects_registration_order() { - let mock_provider1 = - new_mock_provider("foo".to_string(), "bar_q".to_string(), false, false); - let mock_provider2 = new_mock_provider("foo".to_string(), "bar_q".to_string(), false, true); - - let manager = SlotManager::new(); - let some_slots = manager.register(Box::new(mock_provider1)); - let no_slots = manager.register(Box::new(mock_provider2)); - assert!(no_slots.is_none()); - - let mut found = 0; - for _ in 0..10 { - if manager - .try_reserve_wft_slot("foo".to_string(), "bar_q".to_string()) - .is_some() - { - found += 1; + fn registry_keeps_one_provider_per_namespace() { + let manager = ClientWorkerSet::new(); + let mut worker_keys = vec![]; + let mut successful_registrations = 0; + + for i in 0..10 { + let namespace = format!("myId{}", i % 3); + let mock_provider = + new_mock_provider(namespace, "bar_q".to_string(), false, false, false); + let worker_instance_key = mock_provider.worker_instance_key(); + + let result = manager.register_worker(Arc::new(mock_provider), false); + if result.is_ok() { + successful_registrations += 1; + worker_keys.push(worker_instance_key); + } else { + // Should get error for duplicate namespace+task_queue combinations + assert!(result.unwrap_err().to_string().contains( + "Registration of multiple workers on the same namespace and task queue" + )); } } - assert_eq!(found, 10); - assert_eq!((1, 1), manager.num_providers()); - - manager.unregister(some_slots.unwrap()); - assert_eq!((0, 0), manager.num_providers()); - - let mock_provider1 = - new_mock_provider("foo".to_string(), "bar_q".to_string(), false, false); - let mock_provider2 = new_mock_provider("foo".to_string(), "bar_q".to_string(), false, true); - - let no_slots = manager.register(Box::new(mock_provider2)); - let some_slots = manager.register(Box::new(mock_provider1)); - assert!(some_slots.is_none()); - - let mut not_found = 0; - for _ in 0..10 { - if manager - .try_reserve_wft_slot("foo".to_string(), "bar_q".to_string()) - .is_none() - { - not_found += 1; + + assert_eq!(successful_registrations, 3); + assert_eq!(3, manager.num_providers()); + + let count = worker_keys.iter().fold(0, |count, key| { + manager.unregister_worker(*key).unwrap(); + // expect error since worker is already unregistered + let result = manager.unregister_worker(*key); + assert!(result.is_err()); + count + 1 + }); + assert_eq!(3, count); + assert_eq!(0, manager.num_providers()); + } + + struct MockSharedNamespaceWorker { + namespace: String, + callbacks: Arc>>, + } + + impl std::fmt::Debug for MockSharedNamespaceWorker { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> 
std::fmt::Result { + f.debug_struct("MockSharedNamespaceWorker") + .field("namespace", &self.namespace) + .field("callbacks_count", &self.callbacks.read().len()) + .finish() + } + } + + impl MockSharedNamespaceWorker { + fn new(namespace: String) -> Self { + Self { + namespace, + callbacks: Arc::new(RwLock::new(HashMap::new())), } } - assert_eq!(not_found, 10); - assert_eq!((1, 1), manager.num_providers()); - manager.unregister(no_slots.unwrap()); - assert_eq!((0, 0), manager.num_providers()); } - #[test] - fn registry_keeps_one_provider_per_namespace() { - let manager = SlotManager::new(); - let mut worker_keys = vec![]; - for i in 0..10 { - let namespace = format!("myId{}", i % 3); - let mock_provider = new_mock_provider(namespace, "bar_q".to_string(), false, false); - worker_keys.push(manager.register(Box::new(mock_provider))); + impl SharedNamespaceWorkerTrait for MockSharedNamespaceWorker { + fn namespace(&self) -> String { + self.namespace.clone() } - assert_eq!((3, 3), manager.num_providers()); - - let count = worker_keys - .iter() - .filter(|key| key.is_some()) - .fold(0, |count, key| { - manager.unregister(key.unwrap()); - // Should be idempotent - manager.unregister(key.unwrap()); - count + 1 - }); - assert_eq!(3, count); - assert_eq!((0, 0), manager.num_providers()); + + fn register_callback( + &self, + worker_instance_key: Uuid, + heartbeat_callback: HeartbeatCallback, + ) { + self.callbacks + .write() + .insert(worker_instance_key, heartbeat_callback); + } + + fn unregister_callback( + &self, + worker_instance_key: Uuid, + ) -> (Option, bool) { + let mut callbacks = self.callbacks.write(); + let callback = callbacks.remove(&worker_instance_key); + let is_empty = callbacks.is_empty(); + (callback, is_empty) + } + + fn num_workers(&self) -> usize { + self.callbacks.read().len() + } + } + + fn new_mock_provider_with_heartbeat( + namespace: String, + task_queue: String, + heartbeat_enabled: bool, + worker_instance_key: Uuid, + ) -> MockClientWorker { + let mut mock_provider = MockClientWorker::new(); + mock_provider + .expect_try_reserve_wft_slot() + .returning(|| Some(new_mock_slot(false))); + mock_provider + .expect_namespace() + .return_const(namespace.clone()); + mock_provider.expect_task_queue().return_const(task_queue); + mock_provider + .expect_heartbeat_enabled() + .return_const(heartbeat_enabled); + mock_provider + .expect_worker_instance_key() + .return_const(worker_instance_key); + mock_provider.expect_deployment_options().return_const(None); + + if heartbeat_enabled { + mock_provider + .expect_heartbeat_callback() + .returning(|| Some(Arc::new(WorkerHeartbeat::default))); + + let namespace_clone = namespace.clone(); + mock_provider + .expect_new_shared_namespace_worker() + .returning(move || { + Ok(Box::new(MockSharedNamespaceWorker::new( + namespace_clone.clone(), + ))) + }); + } + + mock_provider + } + + #[test] + fn duplicate_namespace_task_queue_registration_fails() { + let manager = ClientWorkerSet::new(); + + let worker1 = new_mock_provider_with_heartbeat( + "test_namespace".to_string(), + "test_queue".to_string(), + true, + Uuid::new_v4(), + ); + + // Same namespace+task_queue but different worker instance + let worker2 = new_mock_provider_with_heartbeat( + "test_namespace".to_string(), + "test_queue".to_string(), + true, + Uuid::new_v4(), + ); + + manager.register_worker(Arc::new(worker1), false).unwrap(); + + // second worker register should fail due to duplicate namespace+task_queue + let result = manager.register_worker(Arc::new(worker2), false); + 
assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Registration of multiple workers on the same namespace and task queue") + ); + + assert_eq!(1, manager.num_providers()); + assert_eq!(manager.num_heartbeat_workers(), 1); + + let impl_ref = manager.worker_manager.read(); + assert_eq!(impl_ref.shared_worker.len(), 1); + assert!(impl_ref.shared_worker.contains_key("test_namespace")); + } + + #[test] + fn multiple_workers_same_namespace_share_heartbeat_manager() { + let manager = ClientWorkerSet::new(); + + let worker1 = new_mock_provider_with_heartbeat( + "shared_namespace".to_string(), + "queue1".to_string(), + true, + Uuid::new_v4(), + ); + + // Same namespace but different task queue + let worker2 = new_mock_provider_with_heartbeat( + "shared_namespace".to_string(), + "queue2".to_string(), + true, + Uuid::new_v4(), + ); + + manager.register_worker(Arc::new(worker1), false).unwrap(); + manager.register_worker(Arc::new(worker2), false).unwrap(); + + assert_eq!(2, manager.num_providers()); + assert_eq!(manager.num_heartbeat_workers(), 2); + + let impl_ref = manager.worker_manager.read(); + assert_eq!(impl_ref.shared_worker.len(), 1); + assert!(impl_ref.shared_worker.contains_key("shared_namespace")); + + let shared_worker = impl_ref.shared_worker.get("shared_namespace").unwrap(); + assert_eq!(shared_worker.namespace(), "shared_namespace"); + } + + #[test] + fn different_namespaces_get_separate_heartbeat_managers() { + let manager = ClientWorkerSet::new(); + let worker1 = new_mock_provider_with_heartbeat( + "namespace1".to_string(), + "queue1".to_string(), + true, + Uuid::new_v4(), + ); + let worker2 = new_mock_provider_with_heartbeat( + "namespace2".to_string(), + "queue1".to_string(), + true, + Uuid::new_v4(), + ); + + manager.register_worker(Arc::new(worker1), false).unwrap(); + manager.register_worker(Arc::new(worker2), false).unwrap(); + + assert_eq!(2, manager.num_providers()); + assert_eq!(manager.num_heartbeat_workers(), 2); + + let impl_ref = manager.worker_manager.read(); + assert_eq!(impl_ref.num_heartbeat_workers(), 2); + assert!(impl_ref.shared_worker.contains_key("namespace1")); + assert!(impl_ref.shared_worker.contains_key("namespace2")); + } + + #[test] + fn unregister_heartbeat_workers_cleans_up_shared_worker_when_last_removed() { + let manager = ClientWorkerSet::new(); + + // Create two workers with same namespace but different task queues + let worker1 = new_mock_provider_with_heartbeat( + "test_namespace".to_string(), + "queue1".to_string(), + true, + Uuid::new_v4(), + ); + let worker2 = new_mock_provider_with_heartbeat( + "test_namespace".to_string(), + "queue2".to_string(), + true, + Uuid::new_v4(), + ); + let worker_instance_key1 = worker1.worker_instance_key(); + let worker_instance_key2 = worker2.worker_instance_key(); + + assert_ne!(worker_instance_key1, worker_instance_key2); + + manager.register_worker(Arc::new(worker1), false).unwrap(); + manager.register_worker(Arc::new(worker2), false).unwrap(); + + // Verify initial state: 2 slot providers, 2 heartbeat workers, 1 shared worker + assert_eq!(2, manager.num_providers()); + assert_eq!(manager.num_heartbeat_workers(), 2); + + let impl_ref = manager.worker_manager.read(); + assert_eq!(impl_ref.shared_worker.len(), 1); + assert!(impl_ref.shared_worker.contains_key("test_namespace")); + assert_eq!( + impl_ref + .shared_worker + .get("test_namespace") + .unwrap() + .num_workers(), + 2 + ); + drop(impl_ref); + + // Unregister first worker + 
manager.unregister_worker(worker_instance_key1).unwrap(); + + // After unregistering first worker: 1 slot provider, 1 heartbeat worker, shared worker still exists + assert_eq!(1, manager.num_providers()); + assert_eq!(manager.num_heartbeat_workers(), 1); + + let impl_ref = manager.worker_manager.read(); + assert_eq!(impl_ref.num_heartbeat_workers(), 1); // SharedNamespaceWorker still exists + assert!(impl_ref.shared_worker.contains_key("test_namespace")); + assert_eq!( + impl_ref + .shared_worker + .get("test_namespace") + .unwrap() + .num_workers(), + 1 + ); + drop(impl_ref); + + // Unregister second worker + manager.unregister_worker(worker_instance_key2).unwrap(); + + // After unregistering last worker: 0 slot providers, 0 heartbeat workers, shared worker is removed + assert_eq!(0, manager.num_providers()); + assert_eq!(manager.num_heartbeat_workers(), 0); + + let impl_ref = manager.worker_manager.read(); + assert_eq!(impl_ref.shared_worker.len(), 0); // SharedNamespaceWorker is cleaned up + assert!(!impl_ref.shared_worker.contains_key("test_namespace")); } } diff --git a/core-api/Cargo.toml b/core-api/Cargo.toml index ab2640dca..83042665a 100644 --- a/core-api/Cargo.toml +++ b/core-api/Cargo.toml @@ -31,6 +31,7 @@ tonic = { workspace = true } tracing = "0.1" tracing-core = "0.1" url = "2.5" +uuid = { version = "1.18.1", features = ["v4"] } [dependencies.temporal-sdk-core-protos] path = "../sdk-core-protos" diff --git a/core-api/src/lib.rs b/core-api/src/lib.rs index ca65ccae9..511c9383f 100644 --- a/core-api/src/lib.rs +++ b/core-api/src/lib.rs @@ -19,6 +19,7 @@ use temporal_sdk_core_protos::coresdk::{ workflow_activation::WorkflowActivation, workflow_completion::WorkflowActivationCompletion, }; +use uuid::Uuid; /// This trait is the primary way by which language specific SDKs interact with the core SDK. /// It represents one worker, which has a (potentially shared) client for connecting to the service @@ -138,6 +139,10 @@ pub trait Worker: Send + Sync { /// This should be called only after [Worker::shutdown] has resolved and/or both polling /// functions have returned `ShutDown` errors. async fn finalize_shutdown(self); + + /// Unique identifier for this worker instance. + /// This must be stable across the worker's lifetime and unique per instance. + fn worker_instance_key(&self) -> Uuid; } #[async_trait::async_trait] @@ -205,6 +210,10 @@ where async fn finalize_shutdown(self) { panic!("Can't finalize shutdown on Arc'd worker") } + + fn worker_instance_key(&self) -> Uuid { + (**self).worker_instance_key() + } } macro_rules! 
dbg_panic { diff --git a/core-api/src/telemetry/metrics.rs b/core-api/src/telemetry/metrics.rs index c1d8e2a1e..94b2b488d 100644 --- a/core-api/src/telemetry/metrics.rs +++ b/core-api/src/telemetry/metrics.rs @@ -5,7 +5,10 @@ use std::{ collections::{BTreeMap, HashMap}, fmt::{Debug, Display}, ops::Deref, - sync::{Arc, OnceLock}, + sync::{ + Arc, OnceLock, + atomic::{AtomicU64, Ordering}, + }, time::Duration, }; @@ -26,6 +29,18 @@ pub trait CoreMeter: Send + Sync + Debug { attribs: NewAttributes, ) -> MetricAttributes; fn counter(&self, params: MetricParameters) -> Counter; + + /// Create a counter with in-memory tracking for worker heartbeating reporting + fn counter_with_in_memory( + &self, + params: MetricParameters, + in_memory_counter: HeartbeatMetricType, + ) -> Counter { + let primary_counter = self.counter(params); + + Counter::new_with_in_memory(primary_counter.primary.metric.clone(), in_memory_counter) + } + fn histogram(&self, params: MetricParameters) -> Histogram; fn histogram_f64(&self, params: MetricParameters) -> HistogramF64; /// Create a histogram which records Durations. Implementations should choose to emit in @@ -33,10 +48,217 @@ pub trait CoreMeter: Send + Sync + Debug { /// [MetricParameters::unit] should be overwritten by implementations to be `ms` or `s` /// accordingly. fn histogram_duration(&self, params: MetricParameters) -> HistogramDuration; + + /// Create a histogram duration with in-memory tracking for worker heartbeating reporting + fn histogram_duration_with_in_memory( + &self, + params: MetricParameters, + in_memory_hist: HeartbeatMetricType, + ) -> HistogramDuration { + let primary_hist = self.histogram_duration(params); + + HistogramDuration::new_with_in_memory(primary_hist.primary.metric.clone(), in_memory_hist) + } fn gauge(&self, params: MetricParameters) -> Gauge; + + /// Create a gauge with in-memory tracking for worker heartbeating reporting + fn gauge_with_in_memory( + &self, + params: MetricParameters, + in_memory_metrics: HeartbeatMetricType, + ) -> Gauge { + let primary_gauge = self.gauge(params.clone()); + Gauge::new_with_in_memory(primary_gauge.primary.metric.clone(), in_memory_metrics) + } + fn gauge_f64(&self, params: MetricParameters) -> GaugeF64; } +/// Provides a generic way to record metrics in memory. +/// This can be done either with individual metrics or more fine-grained metrics +/// that vary by a set of labels for the same metric. +#[derive(Clone, Debug)] +pub enum HeartbeatMetricType { + Individual(Arc), + WithLabel { + label_key: String, + metrics: HashMap>, + }, +} + +impl HeartbeatMetricType { + fn record_counter(&self, delta: u64) { + match self { + HeartbeatMetricType::Individual(metric) => { + metric.fetch_add(delta, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel { .. } => { + dbg_panic!("Counter does not support in-memory metric with labels"); + } + } + } + + fn record_histogram_observation(&self) { + match self { + HeartbeatMetricType::Individual(metric) => { + metric.fetch_add(1, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel { .. 
} => { + dbg_panic!("Histogram does not support in-memory metric with labels"); + } + } + } + + fn record_gauge(&self, value: u64, attributes: &MetricAttributes) { + match self { + HeartbeatMetricType::Individual(metric) => { + metric.store(value, Ordering::Relaxed); + } + HeartbeatMetricType::WithLabel { label_key, metrics } => { + if let Some(metric) = label_value_from_attributes(attributes, label_key.as_str()) + .and_then(|label_value| metrics.get(label_value.as_str())) + { + metric.store(value, Ordering::Relaxed) + } + } + } + } +} + +fn label_value_from_attributes(attributes: &MetricAttributes, key: &str) -> Option { + match attributes { + MetricAttributes::Prometheus { labels } => labels.as_prom_labels().get(key).cloned(), + #[cfg(feature = "otel_impls")] + MetricAttributes::OTel { kvs } => kvs + .iter() + .find(|kv| kv.key.as_str() == key) + .map(|kv| kv.value.to_string()), + MetricAttributes::NoOp(labels) => labels.get(key).cloned(), + _ => None, + } +} + +#[derive(Default, Debug)] +pub struct NumPollersMetric { + pub wft_current_pollers: Arc, + pub sticky_wft_current_pollers: Arc, + pub activity_current_pollers: Arc, + pub nexus_current_pollers: Arc, +} + +impl NumPollersMetric { + pub fn as_map(&self) -> HashMap> { + HashMap::from([ + ( + "workflow_task".to_string(), + self.wft_current_pollers.clone(), + ), + ( + "sticky_workflow_task".to_string(), + self.sticky_wft_current_pollers.clone(), + ), + ( + "activity_task".to_string(), + self.activity_current_pollers.clone(), + ), + ("nexus_task".to_string(), self.nexus_current_pollers.clone()), + ]) + } +} + +#[derive(Default, Debug)] +pub struct SlotMetrics { + pub workflow_worker: Arc, + pub activity_worker: Arc, + pub nexus_worker: Arc, + pub local_activity_worker: Arc, +} + +impl SlotMetrics { + pub fn as_map(&self) -> HashMap> { + HashMap::from([ + ("WorkflowWorker".to_string(), self.workflow_worker.clone()), + ("ActivityWorker".to_string(), self.activity_worker.clone()), + ("NexusWorker".to_string(), self.nexus_worker.clone()), + ( + "LocalActivityWorker".to_string(), + self.local_activity_worker.clone(), + ), + ]) + } +} + +#[derive(Default, Debug)] +pub struct WorkerHeartbeatMetrics { + pub sticky_cache_size: Arc, + pub total_sticky_cache_hit: Arc, + pub total_sticky_cache_miss: Arc, + pub num_pollers: NumPollersMetric, + pub worker_task_slots_used: SlotMetrics, + pub worker_task_slots_available: SlotMetrics, + pub workflow_task_execution_failed: Arc, + pub activity_execution_failed: Arc, + pub nexus_task_execution_failed: Arc, + pub local_activity_execution_failed: Arc, + pub activity_execution_latency: Arc, + pub local_activity_execution_latency: Arc, + pub workflow_task_execution_latency: Arc, + pub nexus_task_execution_latency: Arc, +} + +impl WorkerHeartbeatMetrics { + pub fn get_metric(&self, name: &str) -> Option { + match name { + "sticky_cache_size" => Some(HeartbeatMetricType::Individual( + self.sticky_cache_size.clone(), + )), + "sticky_cache_hit" => Some(HeartbeatMetricType::Individual( + self.total_sticky_cache_hit.clone(), + )), + "sticky_cache_miss" => Some(HeartbeatMetricType::Individual( + self.total_sticky_cache_miss.clone(), + )), + "num_pollers" => Some(HeartbeatMetricType::WithLabel { + label_key: "poller_type".to_string(), + metrics: self.num_pollers.as_map(), + }), + "worker_task_slots_used" => Some(HeartbeatMetricType::WithLabel { + label_key: "worker_type".to_string(), + metrics: self.worker_task_slots_used.as_map(), + }), + "worker_task_slots_available" => Some(HeartbeatMetricType::WithLabel { + 
label_key: "worker_type".to_string(), + metrics: self.worker_task_slots_available.as_map(), + }), + "workflow_task_execution_failed" => Some(HeartbeatMetricType::Individual( + self.workflow_task_execution_failed.clone(), + )), + "activity_execution_failed" => Some(HeartbeatMetricType::Individual( + self.activity_execution_failed.clone(), + )), + "nexus_task_execution_failed" => Some(HeartbeatMetricType::Individual( + self.nexus_task_execution_failed.clone(), + )), + "local_activity_execution_failed" => Some(HeartbeatMetricType::Individual( + self.local_activity_execution_failed.clone(), + )), + "activity_execution_latency" => Some(HeartbeatMetricType::Individual( + self.activity_execution_latency.clone(), + )), + "local_activity_execution_latency" => Some(HeartbeatMetricType::Individual( + self.local_activity_execution_latency.clone(), + )), + "workflow_task_execution_latency" => Some(HeartbeatMetricType::Individual( + self.workflow_task_execution_latency.clone(), + )), + "nexus_task_execution_latency" => Some(HeartbeatMetricType::Individual( + self.nexus_task_execution_latency.clone(), + )), + _ => None, + } + } +} + #[derive(Debug, Clone, derive_builder::Builder)] pub struct MetricParameters { /// The name for the new metric/instrument @@ -125,6 +347,7 @@ pub enum MetricAttributes { }, Buffer(BufferAttributes), Dynamic(Arc), + NoOp(Arc>), Empty, } @@ -156,6 +379,16 @@ where } } +impl From for HashMap { + fn from(value: NewAttributes) -> Self { + value + .attributes + .into_iter() + .map(|kv| (kv.key, kv.value.to_string())) + .collect() + } +} + /// A K/V pair that can be used to label a specific recording of a metric #[derive(Clone, Debug, PartialEq)] pub struct MetricKeyValue { @@ -228,43 +461,79 @@ impl LazyBoundMetric { pub trait CounterBase: Send + Sync { fn adds(&self, value: u64); } -pub type Counter = LazyBoundMetric< + +pub type CounterImpl = LazyBoundMetric< Arc> + Send + Sync>, Arc, >; + +#[derive(Clone)] +pub struct Counter { + primary: CounterImpl, + in_memory: Option, +} impl Counter { pub fn new(inner: Arc> + Send + Sync>) -> Self { Self { - metric: inner, - attributes: MetricAttributes::Empty, - bound_cache: OnceLock::new(), + primary: LazyBoundMetric { + metric: inner, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: None, + } + } + + pub fn new_with_in_memory( + primary: Arc> + Send + Sync>, + in_memory: HeartbeatMetricType, + ) -> Self { + Self { + primary: LazyBoundMetric { + metric: primary, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: Some(in_memory), } } + pub fn add(&self, value: u64, attributes: &MetricAttributes) { - match self.metric.with_attributes(attributes) { - Ok(base) => { - base.adds(value); - } + match self.primary.metric.with_attributes(attributes) { + Ok(base) => base.adds(value), Err(e) => { - dbg_panic!("Failed to initialize metric, will drop values: {e:?}",); + dbg_panic!("Failed to initialize primary metric, will drop values: {e:?}"); } } + + if let Some(ref in_mem) = self.in_memory { + in_mem.record_counter(value); + } + } + + pub fn update_attributes(&mut self, new_attributes: MetricAttributes) { + self.primary.update_attributes(new_attributes.clone()); } } impl CounterBase for Counter { fn adds(&self, value: u64) { // TODO: Replace all of these with below when stable // https://doc.rust-lang.org/std/sync/struct.OnceLock.html#method.get_or_try_init - let bound = self.bound_cache.get_or_init(|| { - self.metric - .with_attributes(&self.attributes) + let bound = 
self.primary.bound_cache.get_or_init(|| { + self.primary + .metric + .with_attributes(&self.primary.attributes) .map(Into::into) .unwrap_or_else(|e| { - dbg_panic!("Failed to initialize metric, will drop values: {e:?}"); + dbg_panic!("Failed to initialize primary metric, will drop values: {e:?}"); Arc::new(NoOpInstrument) as Arc }) }); bound.adds(value); + + if let Some(ref in_mem) = self.in_memory { + in_mem.record_counter(value); + } } } impl MetricAttributable for Counter { @@ -272,10 +541,15 @@ impl MetricAttributable for Counter { &self, attributes: &MetricAttributes, ) -> Result> { - Ok(Self { - metric: self.metric.clone(), + let primary = LazyBoundMetric { + metric: self.primary.metric.clone(), attributes: attributes.clone(), bound_cache: OnceLock::new(), + }; + + Ok(Counter { + primary, + in_memory: self.in_memory.clone(), }) } } @@ -391,22 +665,45 @@ impl MetricAttributable for HistogramF64 { pub trait HistogramDurationBase: Send + Sync { fn records(&self, value: Duration); } -pub type HistogramDuration = LazyBoundMetric< + +pub type HistogramDurationImpl = LazyBoundMetric< Arc> + Send + Sync>, Arc, >; + +#[derive(Clone)] +pub struct HistogramDuration { + primary: HistogramDurationImpl, + in_memory: Option, +} impl HistogramDuration { pub fn new( inner: Arc> + Send + Sync>, ) -> Self { Self { - metric: inner, - attributes: MetricAttributes::Empty, - bound_cache: OnceLock::new(), + primary: LazyBoundMetric { + metric: inner, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: None, + } + } + pub fn new_with_in_memory( + primary: Arc> + Send + Sync>, + in_memory: HeartbeatMetricType, + ) -> Self { + Self { + primary: LazyBoundMetric { + metric: primary, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: Some(in_memory), } } pub fn record(&self, value: Duration, attributes: &MetricAttributes) { - match self.metric.with_attributes(attributes) { + match self.primary.metric.with_attributes(attributes) { Ok(base) => { base.records(value); } @@ -414,13 +711,22 @@ impl HistogramDuration { dbg_panic!("Failed to initialize metric, will drop values: {e:?}",); } } + + if let Some(ref in_mem) = self.in_memory { + in_mem.record_histogram_observation(); + } + } + + pub fn update_attributes(&mut self, new_attributes: MetricAttributes) { + self.primary.update_attributes(new_attributes.clone()); } } impl HistogramDurationBase for HistogramDuration { fn records(&self, value: Duration) { - let bound = self.bound_cache.get_or_init(|| { - self.metric - .with_attributes(&self.attributes) + let bound = self.primary.bound_cache.get_or_init(|| { + self.primary + .metric + .with_attributes(&self.primary.attributes) .map(Into::into) .unwrap_or_else(|e| { dbg_panic!("Failed to initialize metric, will drop values: {e:?}"); @@ -428,6 +734,10 @@ impl HistogramDurationBase for HistogramDuration { }) }); bound.records(value); + + if let Some(ref in_mem) = self.in_memory { + in_mem.record_histogram_observation(); + } } } impl MetricAttributable for HistogramDuration { @@ -435,10 +745,15 @@ impl MetricAttributable for HistogramDuration { &self, attributes: &MetricAttributes, ) -> Result> { - Ok(Self { - metric: self.metric.clone(), + let primary = LazyBoundMetric { + metric: self.primary.metric.clone(), attributes: attributes.clone(), bound_cache: OnceLock::new(), + }; + + Ok(HistogramDuration { + primary, + in_memory: self.in_memory.clone(), }) } } @@ -446,41 +761,77 @@ impl MetricAttributable for HistogramDuration { pub trait GaugeBase: Send 
+ Sync { fn records(&self, value: u64); } -pub type Gauge = LazyBoundMetric< + +pub type GaugeImpl = LazyBoundMetric< Arc> + Send + Sync>, Arc, >; + +#[derive(Clone)] +pub struct Gauge { + primary: GaugeImpl, + in_memory: Option, +} impl Gauge { pub fn new(inner: Arc> + Send + Sync>) -> Self { Self { - metric: inner, - attributes: MetricAttributes::Empty, - bound_cache: OnceLock::new(), + primary: LazyBoundMetric { + metric: inner, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: None, } } + + pub fn new_with_in_memory( + primary: Arc> + Send + Sync>, + in_memory: HeartbeatMetricType, + ) -> Self { + Self { + primary: LazyBoundMetric { + metric: primary, + attributes: MetricAttributes::Empty, + bound_cache: OnceLock::new(), + }, + in_memory: Some(in_memory), + } + } + pub fn record(&self, value: u64, attributes: &MetricAttributes) { - match self.metric.with_attributes(attributes) { - Ok(base) => { - base.records(value); - } + match self.primary.metric.with_attributes(attributes) { + Ok(base) => base.records(value), Err(e) => { - dbg_panic!("Failed to initialize metric, will drop values: {e:?}",); + dbg_panic!("Failed to initialize primary metric, will drop values: {e:?}"); } } + + if let Some(ref in_mem) = self.in_memory { + in_mem.record_gauge(value, attributes); + } + } + + pub fn update_attributes(&mut self, new_attributes: MetricAttributes) { + self.primary.update_attributes(new_attributes.clone()); } } impl GaugeBase for Gauge { fn records(&self, value: u64) { - let bound = self.bound_cache.get_or_init(|| { - self.metric - .with_attributes(&self.attributes) + let bound = self.primary.bound_cache.get_or_init(|| { + self.primary + .metric + .with_attributes(&self.primary.attributes) .map(Into::into) .unwrap_or_else(|e| { - dbg_panic!("Failed to initialize metric, will drop values: {e:?}"); + dbg_panic!("Failed to initialize primary metric, will drop values: {e:?}"); Arc::new(NoOpInstrument) as Arc }) }); bound.records(value); + + if let Some(ref in_mem) = self.in_memory { + in_mem.record_gauge(value, &self.primary.attributes); + } } } impl MetricAttributable for Gauge { @@ -488,10 +839,15 @@ impl MetricAttributable for Gauge { &self, attributes: &MetricAttributes, ) -> Result> { - Ok(Self { - metric: self.metric.clone(), + let primary = LazyBoundMetric { + metric: self.primary.metric.clone(), attributes: attributes.clone(), bound_cache: OnceLock::new(), + }; + + Ok(Gauge { + primary, + in_memory: self.in_memory.clone(), }) } } @@ -634,12 +990,23 @@ impl LazyRef { #[derive(Debug)] pub struct NoOpCoreMeter; impl CoreMeter for NoOpCoreMeter { - fn new_attributes(&self, _: NewAttributes) -> MetricAttributes { - MetricAttributes::Dynamic(Arc::new(NoOpAttributes)) + fn new_attributes(&self, attribs: NewAttributes) -> MetricAttributes { + MetricAttributes::NoOp(Arc::new(attribs.into())) } - fn extend_attributes(&self, existing: MetricAttributes, _: NewAttributes) -> MetricAttributes { - existing + fn extend_attributes( + &self, + existing: MetricAttributes, + attribs: NewAttributes, + ) -> MetricAttributes { + if let MetricAttributes::NoOp(labels) = existing { + let mut labels = (*labels).clone(); + labels.extend::>(attribs.into()); + MetricAttributes::NoOp(Arc::new(labels)) + } else { + dbg_panic!("Must use NoOp attributes with a NoOp metric implementation"); + existing + } } fn counter(&self, _: MetricParameters) -> Counter { @@ -702,11 +1069,41 @@ impl_no_op!(HistogramDurationBase, Duration); impl_no_op!(GaugeBase, u64); impl_no_op!(GaugeF64Base, 
f64); -#[derive(Debug, Clone)] -pub struct NoOpAttributes; -impl CustomMetricAttributes for NoOpAttributes { - fn as_any(self: Arc) -> Arc { - self as Arc +#[cfg(test)] +mod tests { + use super::*; + use std::{ + collections::HashMap, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + }, + }; + + #[test] + fn in_memory_attributes_provide_label_values() { + let meter = NoOpCoreMeter; + let base_attrs = meter.new_attributes(NewAttributes::default()); + let attrs = meter.extend_attributes( + base_attrs, + NewAttributes::from(vec![MetricKeyValue::new("poller_type", "workflow_task")]), + ); + + let value = Arc::new(AtomicU64::new(0)); + let mut metrics = HashMap::new(); + metrics.insert("workflow_task".to_string(), value.clone()); + let heartbeat_metric = HeartbeatMetricType::WithLabel { + label_key: "poller_type".to_string(), + metrics, + }; + + heartbeat_metric.record_gauge(3, &attrs); + + assert_eq!(value.load(Ordering::Relaxed), 3); + assert_eq!( + label_value_from_attributes(&attrs, "poller_type").as_deref(), + Some("workflow_task") + ); } } diff --git a/core-api/src/worker.rs b/core-api/src/worker.rs index b7304c5c9..1a96a6645 100644 --- a/core-api/src/worker.rs +++ b/core-api/src/worker.rs @@ -11,6 +11,7 @@ use temporal_sdk_core_protos::{ coresdk::{ActivitySlotInfo, LocalActivitySlotInfo, NexusSlotInfo, WorkflowSlotInfo}, temporal, temporal::api::enums::v1::VersioningBehavior, + temporal::api::worker::v1::PluginInfo, }; /// Defines per-worker configuration options @@ -141,19 +142,19 @@ pub struct WorkerConfig { /// Mutually exclusive with `tuner` #[builder(setter(into, strip_option), default)] pub max_outstanding_workflow_tasks: Option, - /// The maximum number of activity tasks that will ever be given to this worker concurrently + /// The maximum number of activity tasks that will ever be given to this worker concurrently. /// /// Mutually exclusive with `tuner` #[builder(setter(into, strip_option), default)] pub max_outstanding_activities: Option, /// The maximum number of local activity tasks that will ever be given to this worker - /// concurrently + /// concurrently. /// /// Mutually exclusive with `tuner` #[builder(setter(into, strip_option), default)] pub max_outstanding_local_activities: Option, /// The maximum number of nexus tasks that will ever be given to this worker - /// concurrently + /// concurrently. /// /// Mutually exclusive with `tuner` #[builder(setter(into, strip_option), default)] @@ -162,11 +163,13 @@ pub struct WorkerConfig { /// A versioning strategy for this worker. pub versioning_strategy: WorkerVersioningStrategy, - /// The interval within which the worker will send a heartbeat. - /// The timer is reset on each existing RPC call that also happens to send this data, like - /// `PollWorkflowTaskQueueRequest`. + /// List of plugins used by lang. #[builder(default)] - pub heartbeat_interval: Option, + pub plugins: Vec, + + /// Skips the single worker+client+namespace+task_queue check + #[builder(default = "false")] + pub skip_client_worker_set_check: bool, } impl WorkerConfig { @@ -363,6 +366,12 @@ pub trait SlotSupplier { fn available_slots(&self) -> Option { None } + + /// Returns a human-friendly identifier describing this supplier implementation for + /// diagnostics and telemetry. 
+ fn slot_supplier_kind(&self) -> String { + "Custom".to_string() + } } pub trait SlotReservationContext: Send + Sync { diff --git a/core-c-bridge/include/temporal-sdk-core-c-bridge.h b/core-c-bridge/include/temporal-sdk-core-c-bridge.h index 0bc69d4f5..3390fd2e3 100644 --- a/core-c-bridge/include/temporal-sdk-core-c-bridge.h +++ b/core-c-bridge/include/temporal-sdk-core-c-bridge.h @@ -446,6 +446,7 @@ typedef struct TemporalCoreTelemetryOptions { typedef struct TemporalCoreRuntimeOptions { const struct TemporalCoreTelemetryOptions *telemetry; + uint64_t worker_heartbeat_duration_millis; } TemporalCoreRuntimeOptions; typedef struct TemporalCoreTestServerOptions { @@ -985,8 +986,8 @@ void temporal_core_worker_validate(struct TemporalCoreWorker *worker, void *user_data, TemporalCoreWorkerCallback callback); -void temporal_core_worker_replace_client(struct TemporalCoreWorker *worker, - struct TemporalCoreClient *new_client); +const struct TemporalCoreByteArray *temporal_core_worker_replace_client(struct TemporalCoreWorker *worker, + struct TemporalCoreClient *new_client); void temporal_core_worker_poll_workflow_activation(struct TemporalCoreWorker *worker, void *user_data, diff --git a/core-c-bridge/src/runtime.rs b/core-c-bridge/src/runtime.rs index 94fe46929..5a330e268 100644 --- a/core-c-bridge/src/runtime.rs +++ b/core-c-bridge/src/runtime.rs @@ -16,7 +16,8 @@ use std::{ time::{Duration, UNIX_EPOCH}, }; use temporal_sdk_core::{ - CoreRuntime, TokioRuntimeBuilder, + CoreRuntime, RuntimeOptions as CoreRuntimeOptions, + RuntimeOptionsBuilder as CoreRuntimeOptionsBuilder, TokioRuntimeBuilder, telemetry::{build_otlp_metric_exporter, start_prometheus_metric_exporter}, }; use temporal_sdk_core_api::telemetry::{ @@ -30,6 +31,7 @@ use url::Url; #[repr(C)] pub struct RuntimeOptions { pub telemetry: *const TelemetryOptions, + pub worker_heartbeat_duration_millis: u64, } #[repr(C)] @@ -142,7 +144,7 @@ pub extern "C" fn temporal_core_runtime_new(options: *const RuntimeOptions) -> R let mut runtime = Runtime { core: Arc::new( CoreRuntime::new( - CoreTelemetryOptions::default(), + CoreRuntimeOptions::default(), TokioRuntimeBuilder::default(), ) .unwrap(), @@ -238,8 +240,21 @@ impl Runtime { CoreTelemetryOptions::default() }; + let heartbeat_interval = if options.worker_heartbeat_duration_millis == 0 { + None + } else { + Some(Duration::from_millis( + options.worker_heartbeat_duration_millis, + )) + }; + + let core_runtime_options = CoreRuntimeOptionsBuilder::default() + .telemetry_options(telemetry_options) + .heartbeat_interval(heartbeat_interval) + .build()?; + // Build core runtime - let mut core = CoreRuntime::new(telemetry_options, TokioRuntimeBuilder::default())?; + let mut core = CoreRuntime::new(core_runtime_options, TokioRuntimeBuilder::default())?; // We late-bind the metrics after core runtime is created since it needs // the Tokio handle diff --git a/core-c-bridge/src/tests/context.rs b/core-c-bridge/src/tests/context.rs index 9af21316c..0fb7aaa04 100644 --- a/core-c-bridge/src/tests/context.rs +++ b/core-c-bridge/src/tests/context.rs @@ -153,6 +153,7 @@ impl Context { let RuntimeOrFail { runtime, fail } = temporal_core_runtime_new(&RuntimeOptions { telemetry: std::ptr::null(), + worker_heartbeat_duration_millis: 0, }); if let Some(fail) = byte_array_to_string(runtime, fail) { diff --git a/core-c-bridge/src/worker.rs b/core-c-bridge/src/worker.rs index 6308dd8bc..a68f5696a 100644 --- a/core-c-bridge/src/worker.rs +++ b/core-c-bridge/src/worker.rs @@ -629,11 +629,20 @@ pub extern "C" fn 
temporal_core_worker_validate( pub extern "C" fn temporal_core_worker_replace_client( worker: *mut Worker, new_client: *mut Client, -) { +) -> *const ByteArray { let worker = unsafe { &*worker }; let core_worker = worker.worker.as_ref().expect("missing worker").clone(); let client = unsafe { &*new_client }; - core_worker.replace_client(client.core.get_client().clone()); + + match core_worker.replace_client(client.core.get_client().clone()) { + Ok(()) => std::ptr::null(), + Err(err) => worker + .runtime + .clone() + .alloc_utf8(&format!("Replace client failed: {err}")) + .into_raw() + .cast_const(), + } } /// If success or fail are present, they must be freed. They will both be null diff --git a/core/src/abstractions.rs b/core/src/abstractions.rs index d4b86cb35..0d5a53206 100644 --- a/core/src/abstractions.rs +++ b/core/src/abstractions.rs @@ -25,6 +25,7 @@ use tokio_util::sync::CancellationToken; #[derive(Clone)] pub(crate) struct MeteredPermitDealer { supplier: Arc + Send + Sync>, + slot_supplier_kind: SlotSupplierKind, /// The number of permit owners who have acquired a permit, but are not yet meaningfully using /// that permit. This is useful for giving a more semantically accurate count of used task /// slots, since we typically wait for a permit first before polling, but that slot isn't used @@ -54,6 +55,35 @@ pub(crate) struct PermitDealerContextData { pub(crate) worker_deployment_version: Option, } +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) enum SlotSupplierKind { + Fixed, + ResourceBased, + Custom(String), +} + +impl SlotSupplierKind { + fn from_label(label: &str) -> Self { + if label == "Fixed" { + SlotSupplierKind::Fixed + } else if label == "ResourceBased" { + SlotSupplierKind::ResourceBased + } else { + SlotSupplierKind::Custom(label.to_string()) + } + } +} + +impl std::fmt::Display for SlotSupplierKind { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + SlotSupplierKind::Fixed => f.write_str("Fixed"), + SlotSupplierKind::ResourceBased => f.write_str("ResourceBased"), + SlotSupplierKind::Custom(name) => f.write_str(name.as_str()), + } + } +} + impl MeteredPermitDealer where SK: SlotKind + 'static, @@ -65,8 +95,11 @@ where context_data: Arc, meter: Option, ) -> Self { + let supplier_kind_label = supplier.slot_supplier_kind(); + let slot_supplier_kind = SlotSupplierKind::from_label(supplier_kind_label.as_ref()); Self { supplier, + slot_supplier_kind, unused_claimants: Arc::new(AtomicUsize::new(0)), extant_permits: watch::channel(0), metrics_ctx, @@ -81,6 +114,10 @@ where self.supplier.available_slots() } + pub(crate) fn slot_supplier_kind(&self) -> &SlotSupplierKind { + &self.slot_supplier_kind + } + #[cfg(test)] pub(crate) fn unused_permits(&self) -> Option { self.available_permits() @@ -492,4 +529,10 @@ pub(crate) mod tests { // Now it'll proceed acquire_fut.await; } + + #[test] + fn captures_slot_supplier_kind() { + let dealer = fixed_size_permit_dealer::(1); + assert_eq!(*dealer.slot_supplier_kind(), SlotSupplierKind::Fixed); + } } diff --git a/core/src/core_tests/activity_tasks.rs b/core/src/core_tests/activity_tasks.rs index 6a3acdaaf..b508a10e0 100644 --- a/core/src/core_tests/activity_tasks.rs +++ b/core/src/core_tests/activity_tasks.rs @@ -1,7 +1,7 @@ use crate::{ ActivityHeartbeat, Worker, advance_fut, job_assert, prost_dur, test_help::{ - MockPollCfg, MockWorkerInputs, MocksHolder, QueueResponse, TEST_Q, WorkerExt, + MockPollCfg, MockWorkerInputs, MocksHolder, QueueResponse, WorkerExt, WorkflowCachingPolicy, build_fake_worker, 
build_mock_pollers, fanout_tasks, gen_assert_and_reply, mock_manual_poller, mock_poller, mock_worker, poll_and_reply, single_hist_mock_sg, test_worker_cfg, @@ -734,7 +734,7 @@ async fn no_eager_activities_requested_when_worker_options_disable_it( ScheduleActivity { seq: 1, activity_id: "act_id".to_string(), - task_queue: TEST_Q.to_string(), + task_queue: core.get_config().task_queue.clone(), cancellation_type: ActivityCancellationType::TryCancel as i32, ..Default::default() } @@ -821,6 +821,7 @@ async fn activity_tasks_from_completion_are_delivered() { let mut mock = build_mock_pollers(mh); mock.worker_cfg(|wc| wc.max_cached_workflows = 2); let core = mock_worker(mock); + let task_queue = core.get_config().task_queue.clone(); // Test start let wf_task = core.poll_workflow_activation().await.unwrap(); @@ -829,7 +830,7 @@ async fn activity_tasks_from_completion_are_delivered() { ScheduleActivity { seq, activity_id: format!("act_id_{seq}_same_queue"), - task_queue: TEST_Q.to_string(), + task_queue: task_queue.clone(), cancellation_type: ActivityCancellationType::TryCancel as i32, ..Default::default() } @@ -840,7 +841,7 @@ async fn activity_tasks_from_completion_are_delivered() { ScheduleActivity { seq: 4, activity_id: "act_id_same_queue_not_eager".to_string(), - task_queue: TEST_Q.to_string(), + task_queue: task_queue.clone(), cancellation_type: ActivityCancellationType::TryCancel as i32, ..Default::default() } diff --git a/core/src/core_tests/workers.rs b/core/src/core_tests/workers.rs index b29fbe4b5..68314c082 100644 --- a/core/src/core_tests/workers.rs +++ b/core/src/core_tests/workers.rs @@ -315,18 +315,18 @@ async fn worker_shutdown_api(#[case] use_cache: bool, #[case] api_success: bool) mock.expect_is_mock().returning(|| true); mock.expect_sdk_name_and_version() .returning(|| ("test-core".to_string(), "0.0.0".to_string())); - mock.expect_get_identity() + mock.expect_identity() .returning(|| "test-identity".to_string()); if use_cache { if api_success { mock.expect_shutdown_worker() .times(1) - .returning(|_| Ok(ShutdownWorkerResponse {})); + .returning(|_, _| Ok(ShutdownWorkerResponse {})); } else { // worker.shutdown() should succeed even if shutdown_worker fails mock.expect_shutdown_worker() .times(1) - .returning(|_| Err(tonic::Status::unavailable("fake shutdown error"))); + .returning(|_, _| Err(tonic::Status::unavailable("fake shutdown error"))); } } else { mock.expect_shutdown_worker().times(0); diff --git a/core/src/core_tests/workflow_tasks.rs b/core/src/core_tests/workflow_tasks.rs index e26b6c887..8d5df2b7a 100644 --- a/core/src/core_tests/workflow_tasks.rs +++ b/core/src/core_tests/workflow_tasks.rs @@ -2996,7 +2996,8 @@ async fn both_normal_and_sticky_pollers_poll_concurrently() { Arc::new(mock_client), None, None, - ); + ) + .unwrap(); for _ in 1..50 { let activation = worker.poll_workflow_activation().await.unwrap(); diff --git a/core/src/lib.rs b/core/src/lib.rs index 35c20fdce..e306d5f15 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -41,10 +41,9 @@ pub use temporal_sdk_core_protos as protos; pub use temporal_sdk_core_protos::TaskToken; pub use url::Url; pub use worker::{ - FixedSizeSlotSupplier, RealSysInfo, ResourceBasedSlotsOptions, - ResourceBasedSlotsOptionsBuilder, ResourceBasedTuner, ResourceSlotOptions, SlotSupplierOptions, - TunerBuilder, TunerHolder, TunerHolderOptions, TunerHolderOptionsBuilder, Worker, WorkerConfig, - WorkerConfigBuilder, + FixedSizeSlotSupplier, ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder, + ResourceBasedTuner, 
ResourceSlotOptions, SlotSupplierOptions, TunerBuilder, TunerHolder, + TunerHolderOptions, TunerHolderOptionsBuilder, Worker, WorkerConfig, WorkerConfigBuilder, }; /// Expose [WorkerClient] symbols @@ -61,7 +60,8 @@ use crate::{ }; use anyhow::bail; use futures_util::Stream; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; +use std::time::Duration; use temporal_client::{ConfiguredClient, NamespacedClient, SharedReplaceableClient}; use temporal_sdk_core_api::{ Worker as WorkerTrait, @@ -89,40 +89,40 @@ pub fn init_worker( where CT: Into, { - if worker_config.namespace.is_empty() { + let namespace = worker_config.namespace.clone(); + if namespace.is_empty() { bail!("Worker namespace cannot be empty"); } let client = RetryClient::new( - SharedReplaceableClient::new(init_worker_client(&worker_config, client)), + SharedReplaceableClient::new(init_worker_client( + worker_config.namespace.clone(), + worker_config.client_identity_override.clone(), + client, + )), RetryConfig::default(), ); let client_ident = client.identity(); - let sticky_q = sticky_q_name_for_worker(&client_ident, &worker_config); + let sticky_q = sticky_q_name_for_worker(&client_ident, worker_config.max_cached_workflows); if client_ident.is_empty() { bail!("Client identity cannot be empty. Either lang or user should be setting this value"); } - let heartbeat_fn = worker_config - .heartbeat_interval - .map(|_| Arc::new(OnceLock::new())); - let client_bag = Arc::new(WorkerClientBag::new( client, - worker_config.namespace.clone(), - client_ident, + namespace.clone(), + client_ident.clone(), worker_config.versioning_strategy.clone(), - heartbeat_fn.clone(), )); - Ok(Worker::new( - worker_config, + Worker::new( + worker_config.clone(), sticky_q, - client_bag, + client_bag.clone(), Some(&runtime.telemetry), - heartbeat_fn, - )) + runtime.heartbeat_interval, + ) } /// Create a worker for replaying one or more existing histories. It will auto-shutdown as soon as @@ -142,12 +142,16 @@ where rwi.into_core_worker() } -pub(crate) fn init_worker_client(config: &WorkerConfig, client: CT) -> Client +pub(crate) fn init_worker_client( + namespace: String, + client_identity_override: Option, + client: CT, +) -> Client where CT: Into, { - let mut client = Client::new(*client.into().into_inner(), config.namespace.clone()); - if let Some(ref id_override) = config.client_identity_override { + let mut client = Client::new(*client.into().into_inner(), namespace.clone()); + if let Some(ref id_override) = client_identity_override { client.options_mut().identity.clone_from(id_override); } client @@ -157,9 +161,9 @@ where /// workflows. pub(crate) fn sticky_q_name_for_worker( process_identity: &str, - config: &WorkerConfig, + max_cached_workflows: usize, ) -> Option { - if config.max_cached_workflows > 0 { + if max_cached_workflows > 0 { Some(format!( "{}-{}", &process_identity, @@ -234,6 +238,21 @@ pub struct CoreRuntime { telemetry: TelemetryInstance, runtime: Option, runtime_handle: tokio::runtime::Handle, + heartbeat_interval: Option, +} + +/// Holds telemetry options as well as the worker heartbeat interval. Construct with [RuntimeOptionsBuilder]. +#[derive(derive_builder::Builder)] +#[non_exhaustive] +#[derive(Default)] +pub struct RuntimeOptions { + /// Telemetry configuration options. + #[builder(default)] + telemetry_options: TelemetryOptions, + /// Optional worker heartbeat interval. This configures the heartbeat interval for all + /// workers created using this runtime. 
+ #[builder(default = "Some(Duration::from_secs(60))")] + heartbeat_interval: Option, } /// Wraps a [tokio::runtime::Builder] to allow layering multiple on_thread_start functions @@ -268,13 +287,13 @@ impl CoreRuntime { /// If a tokio runtime has already been initialized. To re-use an existing runtime, call /// [CoreRuntime::new_assume_tokio]. pub fn new( - telemetry_options: TelemetryOptions, + runtime_options: RuntimeOptions, mut tokio_builder: TokioRuntimeBuilder, ) -> Result where F: Fn() + Send + Sync + 'static, { - let telemetry = telemetry_init(telemetry_options)?; + let telemetry = telemetry_init(runtime_options.telemetry_options)?; let subscriber = telemetry.trace_subscriber(); let runtime = tokio_builder .inner @@ -289,7 +308,8 @@ impl CoreRuntime { }) .build()?; let _rg = runtime.enter(); - let mut me = Self::new_assume_tokio_initialized_telem(telemetry); + let mut me = + Self::new_assume_tokio_initialized_telem(telemetry, runtime_options.heartbeat_interval); me.runtime = Some(runtime); Ok(me) } @@ -299,9 +319,12 @@ impl CoreRuntime { /// /// # Panics /// If there is no currently active Tokio runtime - pub fn new_assume_tokio(telemetry_options: TelemetryOptions) -> Result { - let telemetry = telemetry_init(telemetry_options)?; - Ok(Self::new_assume_tokio_initialized_telem(telemetry)) + pub fn new_assume_tokio(runtime_options: RuntimeOptions) -> Result { + let telemetry = telemetry_init(runtime_options.telemetry_options)?; + Ok(Self::new_assume_tokio_initialized_telem( + telemetry, + runtime_options.heartbeat_interval, + )) } /// Construct a runtime from an already-initialized telemetry instance, assuming a tokio runtime @@ -309,7 +332,10 @@ impl CoreRuntime { /// /// # Panics /// If there is no currently active Tokio runtime - pub fn new_assume_tokio_initialized_telem(telemetry: TelemetryInstance) -> Self { + pub fn new_assume_tokio_initialized_telem( + telemetry: TelemetryInstance, + heartbeat_interval: Option, + ) -> Self { let runtime_handle = tokio::runtime::Handle::current(); if let Some(sub) = telemetry.trace_subscriber() { set_trace_subscriber_for_current_thread(sub); @@ -318,6 +344,7 @@ impl CoreRuntime { telemetry, runtime: None, runtime_handle, + heartbeat_interval, } } diff --git a/core/src/pollers/poll_buffer.rs b/core/src/pollers/poll_buffer.rs index 72f2e8a41..786b195e4 100644 --- a/core/src/pollers/poll_buffer.rs +++ b/core/src/pollers/poll_buffer.rs @@ -6,8 +6,10 @@ use crate::{ client::{PollActivityOptions, PollOptions, PollWorkflowOptions, WorkerClient}, }, }; +use crossbeam_utils::atomic::AtomicCell; use futures_util::{FutureExt, StreamExt, future::BoxFuture}; use governor::{Quota, RateLimiter}; +use std::time::SystemTime; use std::{ cmp, fmt::Debug, @@ -74,9 +76,15 @@ impl LongPollBuffer { shutdown: CancellationToken, num_pollers_handler: Option, options: WorkflowTaskOptions, + last_successful_poll_time: Arc>>, ) -> Self { let is_sticky = sticky_queue.is_some(); - let poll_scaler = PollScaler::new(poller_behavior, num_pollers_handler, shutdown.clone()); + let poll_scaler = PollScaler::new( + poller_behavior, + num_pollers_handler, + shutdown.clone(), + last_successful_poll_time, + ); if let Some(wftps) = options.wft_poller_shared.as_ref() { if is_sticky { wftps.set_sticky_active(poll_scaler.active_rx.clone()); @@ -136,6 +144,7 @@ impl LongPollBuffer { } impl LongPollBuffer { + #[allow(clippy::too_many_arguments)] pub(crate) fn new_activity_task( client: Arc, task_queue: String, @@ -144,6 +153,7 @@ impl LongPollBuffer { shutdown: CancellationToken, 
num_pollers_handler: Option, options: ActivityTaskOptions, + last_successful_poll_time: Arc>>, ) -> Self { let pre_permit_delay = options .max_worker_acts_per_second @@ -183,7 +193,12 @@ impl LongPollBuffer { } }; - let poll_scaler = PollScaler::new(poller_behavior, num_pollers_handler, shutdown.clone()); + let poll_scaler = PollScaler::new( + poller_behavior, + num_pollers_handler, + shutdown.clone(), + last_successful_poll_time, + ); Self::new( poll_fn, permit_dealer, @@ -196,6 +211,7 @@ impl LongPollBuffer { } impl LongPollBuffer { + #[allow(clippy::too_many_arguments)] pub(crate) fn new_nexus_task( client: Arc, task_queue: String, @@ -203,6 +219,8 @@ impl LongPollBuffer { permit_dealer: MeteredPermitDealer, shutdown: CancellationToken, num_pollers_handler: Option, + last_successful_poll_time: Arc>>, + send_heartbeat: bool, ) -> Self { let no_retry = if matches!(poller_behavior, PollerBehavior::Autoscaling { .. }) { Some(NoRetryOnMatching { @@ -216,11 +234,14 @@ impl LongPollBuffer { let task_queue = task_queue.clone(); async move { client - .poll_nexus_task(PollOptions { - task_queue, - no_retry, - timeout_override, - }) + .poll_nexus_task( + PollOptions { + task_queue, + no_retry, + timeout_override, + }, + send_heartbeat, + ) .await } }; @@ -228,7 +249,12 @@ impl LongPollBuffer { poll_fn, permit_dealer, shutdown.clone(), - PollScaler::new(poller_behavior, num_pollers_handler, shutdown), + PollScaler::new( + poller_behavior, + num_pollers_handler, + shutdown, + last_successful_poll_time, + ), None:: BoxFuture<'static, ()>>, None::, ) @@ -413,6 +439,7 @@ where behavior: PollerBehavior, num_pollers_handler: Option, shutdown: CancellationToken, + last_successful_poll_time: Arc>>, ) -> Self { let (active_tx, active_rx) = watch::channel(0); let num_pollers_handler = num_pollers_handler.map(Arc::new); @@ -433,6 +460,7 @@ where ingested_this_period: Default::default(), ingested_last_period: Default::default(), scale_up_allowed: AtomicBool::new(true), + last_successful_poll_time, }); let rhc = report_handle.clone(); let ingestor_task = if behavior.is_autoscaling() { @@ -495,6 +523,7 @@ struct PollScalerReportHandle { ingested_this_period: AtomicUsize, ingested_last_period: AtomicUsize, scale_up_allowed: AtomicBool, + last_successful_poll_time: Arc>>, } impl PollScalerReportHandle { @@ -502,6 +531,8 @@ impl PollScalerReportHandle { fn poll_result(&self, res: &Result) -> bool { match res { Ok(res) => { + self.last_successful_poll_time + .store(Some(SystemTime::now())); if let PollerBehavior::SimpleMaximum(_) = self.behavior { // We don't do auto-scaling with the simple max return true; @@ -735,6 +766,7 @@ mod tests { WorkflowTaskOptions { wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(10)))), }, + Arc::new(AtomicCell::new(None)), ); // Poll a bunch of times, "interrupting" it each time, we should only actually have polled @@ -790,6 +822,7 @@ mod tests { WorkflowTaskOptions { wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(1)))), }, + Arc::new(AtomicCell::new(None)), ); // Should not see error, unwraps should get empty response diff --git a/core/src/replay/mod.rs b/core/src/replay/mod.rs index 650070b20..03f0003be 100644 --- a/core/src/replay/mod.rs +++ b/core/src/replay/mod.rs @@ -114,7 +114,7 @@ where hist_allow_tx.send("Failed".to_string()).unwrap(); async move { Ok(RespondWorkflowTaskFailedResponse::default()) }.boxed() }); - let mut worker = Worker::new(self.config, None, Arc::new(client), None, None); + let mut worker = Worker::new(self.config, None, 
Arc::new(client), None, None)?; worker.set_post_activate_hook(post_activate); shutdown_tok(worker.shutdown_token()); Ok(worker) diff --git a/core/src/telemetry/metrics.rs b/core/src/telemetry/metrics.rs index cd162b3ae..23cbbd42c 100644 --- a/core/src/telemetry/metrics.rs +++ b/core/src/telemetry/metrics.rs @@ -1,4 +1,6 @@ -use crate::{abstractions::dbg_panic, telemetry::TelemetryInstance}; +#[cfg(test)] +use crate::TelemetryInstance; +use crate::abstractions::dbg_panic; use std::{ fmt::{Debug, Display}, @@ -11,7 +13,7 @@ use temporal_sdk_core_api::telemetry::metrics::{ GaugeF64, GaugeF64Base, Histogram, HistogramBase, HistogramDuration, HistogramDurationBase, HistogramF64, HistogramF64Base, LazyBufferInstrument, MetricAttributable, MetricAttributes, MetricCallBufferer, MetricEvent, MetricKeyValue, MetricKind, MetricParameters, MetricUpdateVal, - NewAttributes, NoOpCoreMeter, + NewAttributes, NoOpCoreMeter, TemporalMeter, WorkerHeartbeatMetrics, }; use temporal_sdk_core_protos::temporal::api::{ enums::v1::WorkflowTaskFailedCause, failure::v1::Failure, @@ -23,6 +25,7 @@ pub(crate) struct MetricsContext { meter: Arc, kvs: MetricAttributes, instruments: Arc, + in_memory_metrics: Option>, } #[derive(Clone)] @@ -68,28 +71,41 @@ impl MetricsContext { pub(crate) fn no_op() -> Self { let meter = Arc::new(NoOpCoreMeter); let kvs = meter.new_attributes(Default::default()); - let instruments = Arc::new(Instruments::new(meter.as_ref())); + let in_memory_metrics = Some(Arc::new(WorkerHeartbeatMetrics::default())); + let instruments = Arc::new(Instruments::new(meter.as_ref(), in_memory_metrics.clone())); Self { kvs, instruments, meter, + in_memory_metrics, } } + #[cfg(test)] pub(crate) fn top_level(namespace: String, tq: String, telemetry: &TelemetryInstance) -> Self { - if let Some(mut meter) = telemetry.get_temporal_metric_meter() { + MetricsContext::top_level_with_meter(namespace, tq, telemetry.get_temporal_metric_meter()) + } + + pub(crate) fn top_level_with_meter( + namespace: String, + tq: String, + temporal_meter: Option, + ) -> Self { + if let Some(mut meter) = temporal_meter { meter .default_attribs .attributes .push(MetricKeyValue::new(KEY_NAMESPACE, namespace)); meter.default_attribs.attributes.push(task_queue(tq)); let kvs = meter.inner.new_attributes(meter.default_attribs); - let mut instruments = Instruments::new(meter.inner.as_ref()); + let in_memory_metrics = Some(Arc::new(WorkerHeartbeatMetrics::default())); + let mut instruments = Instruments::new(meter.inner.as_ref(), in_memory_metrics.clone()); instruments.update_attributes(&kvs); Self { kvs, instruments: Arc::new(instruments), meter: meter.inner, + in_memory_metrics, } } else { Self::no_op() @@ -110,9 +126,14 @@ impl MetricsContext { instruments: Arc::new(instruments), kvs, meter: self.meter.clone(), + in_memory_metrics: self.in_memory_metrics.clone(), } } + pub(crate) fn in_memory_meter(&self) -> Option> { + self.in_memory_metrics.clone() + } + /// A workflow task queue poll succeeded pub(crate) fn wf_tq_poll_ok(&self) { self.instruments.wf_task_queue_poll_succeed_counter.adds(1); @@ -288,7 +309,31 @@ impl MetricsContext { } impl Instruments { - fn new(meter: &dyn CoreMeter) -> Self { + fn new(meter: &dyn CoreMeter, in_memory: Option>) -> Self { + let counter_with_in_mem = |params: MetricParameters| -> Counter { + in_memory + .clone() + .and_then(|in_mem| in_mem.get_metric(¶ms.name)) + .map(|metric| meter.counter_with_in_memory(params.clone(), metric)) + .unwrap_or_else(|| meter.counter(params)) + }; + + let gauge_with_in_mem = 
|params: MetricParameters| -> Gauge { + in_memory + .clone() + .and_then(|in_mem| in_mem.get_metric(¶ms.name)) + .map(|metric| meter.gauge_with_in_memory(params.clone(), metric)) + .unwrap_or_else(|| meter.gauge(params)) + }; + + let histogram_with_in_mem = |params: MetricParameters| -> HistogramDuration { + in_memory + .clone() + .and_then(|in_mem| in_mem.get_metric(¶ms.name)) + .map(|metric| meter.histogram_duration_with_in_memory(params.clone(), metric)) + .unwrap_or_else(|| meter.histogram_duration(params)) + }; + Self { wf_completed_counter: meter.counter(MetricParameters { name: "workflow_completed".into(), @@ -320,12 +365,12 @@ impl Instruments { description: "Count of workflow task queue poll timeouts (no new task)".into(), unit: "".into(), }), - wf_task_queue_poll_succeed_counter: meter.counter(MetricParameters { + wf_task_queue_poll_succeed_counter: counter_with_in_mem(MetricParameters { name: "workflow_task_queue_poll_succeed".into(), description: "Count of workflow task queue poll successes".into(), unit: "".into(), }), - wf_task_execution_failure_counter: meter.counter(MetricParameters { + wf_task_execution_failure_counter: counter_with_in_mem(MetricParameters { name: "workflow_task_execution_failed".into(), description: "Count of workflow task execution failures".into(), unit: "".into(), @@ -340,7 +385,7 @@ impl Instruments { unit: "duration".into(), description: "Histogram of workflow task replay latencies".into(), }), - wf_task_execution_latency: meter.histogram_duration(MetricParameters { + wf_task_execution_latency: histogram_with_in_mem(MetricParameters { name: WORKFLOW_TASK_EXECUTION_LATENCY_HISTOGRAM_NAME.into(), unit: "duration".into(), description: "Histogram of workflow task execution (not replay) latencies".into(), @@ -350,12 +395,12 @@ impl Instruments { description: "Count of activity task queue poll timeouts (no new task)".into(), unit: "".into(), }), - act_task_received_counter: meter.counter(MetricParameters { + act_task_received_counter: counter_with_in_mem(MetricParameters { name: "activity_task_received".into(), description: "Count of activity task queue poll successes".into(), unit: "".into(), }), - act_execution_failed: meter.counter(MetricParameters { + act_execution_failed: counter_with_in_mem(MetricParameters { name: "activity_execution_failed".into(), description: "Count of activity task execution failures".into(), unit: "".into(), @@ -365,7 +410,7 @@ impl Instruments { unit: "duration".into(), description: "Histogram of activity schedule-to-start latencies".into(), }), - act_exec_latency: meter.histogram_duration(MetricParameters { + act_exec_latency: histogram_with_in_mem(MetricParameters { name: ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME.into(), unit: "duration".into(), description: "Histogram of activity execution latencies".into(), @@ -386,7 +431,7 @@ impl Instruments { description: "Count of local activity executions that failed".into(), unit: "".into(), }), - la_exec_latency: meter.histogram_duration(MetricParameters { + la_exec_latency: histogram_with_in_mem(MetricParameters { name: "local_activity_execution_latency".into(), unit: "duration".into(), description: "Histogram of local activity execution latencies".into(), @@ -398,7 +443,7 @@ impl Instruments { "Histogram of local activity execution latencies for successful local activities" .into(), }), - la_total: meter.counter(MetricParameters { + la_total: counter_with_in_mem(MetricParameters { name: "local_activity_total".into(), description: "Count of local activities executed".into(), unit: 
"".into(), @@ -418,12 +463,12 @@ impl Instruments { unit: "duration".into(), description: "Histogram of nexus task end-to-end latencies".into(), }), - nexus_task_execution_latency: meter.histogram_duration(MetricParameters { + nexus_task_execution_latency: histogram_with_in_mem(MetricParameters { name: "nexus_task_execution_latency".into(), unit: "duration".into(), description: "Histogram of nexus task execution latencies".into(), }), - nexus_task_execution_failed: meter.counter(MetricParameters { + nexus_task_execution_failed: counter_with_in_mem(MetricParameters { name: "nexus_task_execution_failed".into(), description: "Count of nexus task execution failures".into(), unit: "".into(), @@ -434,35 +479,34 @@ impl Instruments { description: "Count of the number of initialized workers".into(), unit: "".into(), }), - num_pollers: meter.gauge(MetricParameters { + num_pollers: gauge_with_in_mem(MetricParameters { name: NUM_POLLERS_NAME.into(), description: "Current number of active pollers per queue type".into(), unit: "".into(), }), - task_slots_available: meter.gauge(MetricParameters { + task_slots_available: gauge_with_in_mem(MetricParameters { name: TASK_SLOTS_AVAILABLE_NAME.into(), description: "Current number of available slots per task type".into(), unit: "".into(), }), - task_slots_used: meter.gauge(MetricParameters { + task_slots_used: gauge_with_in_mem(MetricParameters { name: TASK_SLOTS_USED_NAME.into(), description: "Current number of used slots per task type".into(), unit: "".into(), }), - sticky_cache_hit: meter.counter(MetricParameters { + sticky_cache_hit: counter_with_in_mem(MetricParameters { name: "sticky_cache_hit".into(), description: "Count of times the workflow cache was used for a new workflow task" .into(), unit: "".into(), }), - sticky_cache_miss: meter.counter(MetricParameters { + sticky_cache_miss: counter_with_in_mem(MetricParameters { name: "sticky_cache_miss".into(), description: - "Count of times the workflow cache was missing a workflow for a sticky task" - .into(), + "Count of times the workflow cache was missing a workflow for a sticky task".into(), unit: "".into(), }), - sticky_cache_size: meter.gauge(MetricParameters { + sticky_cache_size: gauge_with_in_mem(MetricParameters { name: STICKY_CACHE_SIZE_NAME.into(), description: "Current number of cached workflows".into(), unit: "".into(), diff --git a/core/src/telemetry/mod.rs b/core/src/telemetry/mod.rs index 14b82a33f..60f3756d6 100644 --- a/core/src/telemetry/mod.rs +++ b/core/src/telemetry/mod.rs @@ -39,6 +39,7 @@ use std::{ atomic::{AtomicBool, Ordering}, }, }; +pub(crate) use temporal_sdk_core_api::telemetry::metrics::WorkerHeartbeatMetrics; use temporal_sdk_core_api::telemetry::{ CoreLog, CoreTelemetry, Logger, TaskQueueLabelStrategy, TelemetryOptions, TelemetryOptionsBuilder, diff --git a/core/src/test_help/integ_helpers.rs b/core/src/test_help/integ_helpers.rs index 10e43b174..085f44e07 100644 --- a/core/src/test_help/integ_helpers.rs +++ b/core/src/test_help/integ_helpers.rs @@ -62,13 +62,11 @@ use temporal_sdk_core_protos::{ }; use tokio::sync::{Notify, mpsc::unbounded_channel}; use tokio_stream::wrappers::UnboundedReceiverStream; +use uuid::Uuid; /// Default namespace for testing pub const NAMESPACE: &str = "default"; -/// Default task queue for testing -pub const TEST_Q: &str = "q"; - /// Initiate shutdown, drain the pollers (handling evictions), and wait for shutdown to complete. 
pub async fn drain_pollers_and_shutdown(worker: &dyn WorkerTrait) { worker.initiate_shutdown(); @@ -102,7 +100,7 @@ pub async fn drain_pollers_and_shutdown(worker: &dyn WorkerTrait) { pub fn test_worker_cfg() -> WorkerConfigBuilder { let mut wcb = WorkerConfigBuilder::default(); wcb.namespace(NAMESPACE) - .task_queue(TEST_Q) + .task_queue(Uuid::new_v4().to_string()) .versioning_strategy(WorkerVersioningStrategy::None { build_id: "test_bin_id".to_string(), }) @@ -185,7 +183,7 @@ pub fn build_fake_worker( } pub fn mock_worker(mocks: MocksHolder) -> Worker { - let sticky_q = sticky_q_name_for_worker("unit-test", &mocks.inputs.config); + let sticky_q = sticky_q_name_for_worker("unit-test", mocks.inputs.config.max_cached_workflows); let act_poller = if mocks.inputs.config.no_remote_activities { None } else { @@ -205,7 +203,9 @@ pub fn mock_worker(mocks: MocksHolder) -> Worker { }, None, None, + false, ) + .unwrap() } pub struct FakeWfResponses { @@ -275,7 +275,7 @@ impl MocksHolder { } } - /// Uses the provided list of tasks to create a mock poller for the `TEST_Q` + /// Uses the provided list of tasks to create a mock poller with a randomly generated task queue pub fn from_client_with_activities( client: impl WorkerClient + 'static, act_tasks: ACT, diff --git a/core/src/worker/activities.rs b/core/src/worker/activities.rs index b64b1b16f..071c8d3ec 100644 --- a/core/src/worker/activities.rs +++ b/core/src/worker/activities.rs @@ -735,6 +735,7 @@ mod tests { prost_dur, worker::client::mocks::mock_worker_client, }; + use crossbeam_utils::atomic::AtomicCell; use temporal_sdk_core_api::worker::PollerBehavior; use temporal_sdk_core_protos::coresdk::activity_result::ActivityExecutionResult; @@ -780,6 +781,7 @@ mod tests { max_worker_acts_per_second: Some(2.0), max_tps: None, }, + Arc::new(AtomicCell::new(None)), ); let atm = WorkerActivityTasks::new( sem.clone(), @@ -871,6 +873,7 @@ mod tests { max_worker_acts_per_second: None, max_tps: None, }, + Arc::new(AtomicCell::new(None)), ); let atm = WorkerActivityTasks::new( sem.clone(), @@ -944,6 +947,7 @@ mod tests { max_worker_acts_per_second: None, max_tps: None, }, + Arc::new(AtomicCell::new(None)), ); let atm = WorkerActivityTasks::new( sem.clone(), diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index a480180d9..2fae0309a 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -1,18 +1,19 @@ //! 
Worker-specific client needs pub(crate) mod mocks; -use crate::{ - abstractions::dbg_panic, protosext::legacy_query_failure, worker::heartbeat::HeartbeatFn, -}; -use std::{ - sync::{Arc, OnceLock}, - time::Duration, -}; +use crate::protosext::legacy_query_failure; +use parking_lot::Mutex; +use prost_types::Duration as PbDuration; +use std::collections::HashMap; +use std::time::SystemTime; +use std::{sync::Arc, time::Duration}; use temporal_client::{ - Client, IsWorkerTaskLongPoll, Namespace, NamespacedClient, NoRetryOnMatching, RetryClient, - SharedReplaceableClient, SlotManager, WorkflowService, + Client, ClientWorkerSet, IsWorkerTaskLongPoll, Namespace, NamespacedClient, NoRetryOnMatching, + RetryClient, SharedReplaceableClient, WorkflowService, }; use temporal_sdk_core_api::worker::WorkerVersioningStrategy; +use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; +use temporal_sdk_core_protos::temporal::api::worker::v1::WorkerSlotsInfo; use temporal_sdk_core_protos::{ TaskToken, coresdk::{workflow_commands::QueryResult, workflow_completion}, @@ -37,6 +38,7 @@ use temporal_sdk_core_protos::{ }, }; use tonic::IntoRequest; +use uuid::Uuid; type Result = std::result::Result; @@ -51,7 +53,7 @@ pub(crate) struct WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, - heartbeat_data: Option>>, + worker_heartbeat_map: Arc>>, } impl WorkerClientBag { @@ -60,14 +62,13 @@ impl WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, - heartbeat_data: Option>>, ) -> Self { Self { client, namespace, identity, worker_versioning_strategy, - heartbeat_data, + worker_heartbeat_map: Arc::new(Mutex::new(HashMap::new())), } } @@ -124,19 +125,6 @@ impl WorkerClientBag { None } } - - fn capture_heartbeat(&self) -> Option { - if let Some(heartbeat_data) = self.heartbeat_data.as_ref() { - if let Some(hb) = heartbeat_data.get() { - hb() - } else { - dbg_panic!("Heartbeat function never set"); - None - } - } else { - None - } - } } /// This trait contains everything workers need to interact with Temporal, and hence provides a @@ -160,6 +148,7 @@ pub trait WorkerClient: Sync + Send { async fn poll_nexus_task( &self, poll_options: PollOptions, + send_heartbeat: bool, ) -> Result; /// Complete a workflow task async fn complete_workflow_task( @@ -225,11 +214,16 @@ pub trait WorkerClient: Sync + Send { /// Describe the namespace async fn describe_namespace(&self) -> Result; /// Shutdown the worker - async fn shutdown_worker(&self, sticky_task_queue: String) -> Result; + async fn shutdown_worker( + &self, + sticky_task_queue: String, + final_heartbeat: Option, + ) -> Result; /// Record a worker heartbeat async fn record_worker_heartbeat( &self, - heartbeat: WorkerHeartbeat, + namespace: String, + worker_heartbeat: Vec, ) -> Result; /// Replace the underlying client @@ -237,13 +231,18 @@ pub trait WorkerClient: Sync + Send { /// Return server capabilities fn capabilities(&self) -> Option; /// Return workers using this client - fn workers(&self) -> Arc; + fn workers(&self) -> Arc; /// Indicates if this is a mock client fn is_mock(&self) -> bool; /// Return name and version of the SDK fn sdk_name_and_version(&self) -> (String, String); /// Get worker identity - fn get_identity(&self) -> String; + fn identity(&self) -> String; + /// Get worker grouping key + fn worker_grouping_key(&self) -> Uuid; + /// Sets the client-reliant fields for WorkerHeartbeat. 
This also updates client-level tracking + /// of heartbeat fields, like last heartbeat timestamp. + fn set_heartbeat_client_fields(&self, heartbeat: &mut WorkerHeartbeat); } /// Configuration options shared by workflow, activity, and Nexus polling calls @@ -357,6 +356,7 @@ impl WorkerClient for WorkerClientBag { async fn poll_nexus_task( &self, poll_options: PollOptions, + _send_heartbeat: bool, ) -> Result { #[allow(deprecated)] // want to list all fields explicitly let mut request = PollNexusTaskQueueRequest { @@ -369,7 +369,7 @@ impl WorkerClient for WorkerClientBag { identity: self.identity.clone(), worker_version_capabilities: self.worker_version_capabilities(), deployment_options: self.deployment_options(), - worker_heartbeat: self.capture_heartbeat().into_iter().collect(), + worker_heartbeat: Vec::new(), } .into_request(); request.extensions_mut().insert(IsWorkerTaskLongPoll); @@ -684,13 +684,22 @@ impl WorkerClient for WorkerClientBag { .into_inner()) } - async fn shutdown_worker(&self, sticky_task_queue: String) -> Result { + async fn shutdown_worker( + &self, + sticky_task_queue: String, + final_heartbeat: Option, + ) -> Result { + let mut final_heartbeat = final_heartbeat; + if let Some(w) = final_heartbeat.as_mut() { + w.status = WorkerStatus::Shutdown.into(); + self.set_heartbeat_client_fields(w); + } let request = ShutdownWorkerRequest { namespace: self.namespace.clone(), identity: self.identity.clone(), sticky_task_queue, reason: "graceful shutdown".to_string(), - worker_heartbeat: self.capture_heartbeat(), + worker_heartbeat: final_heartbeat, }; Ok( @@ -700,29 +709,28 @@ impl WorkerClient for WorkerClientBag { ) } - fn replace_client(&self, new_client: Client) { - self.client.get_client().replace_client(new_client); - } - async fn record_worker_heartbeat( &self, - heartbeat: WorkerHeartbeat, + namespace: String, + worker_heartbeat: Vec, ) -> Result { + let request = RecordWorkerHeartbeatRequest { + namespace, + identity: self.identity.clone(), + worker_heartbeat, + }; Ok(self .client .clone() - .record_worker_heartbeat( - RecordWorkerHeartbeatRequest { - namespace: self.namespace.clone(), - identity: self.identity.clone(), - worker_heartbeat: vec![heartbeat], - } - .into_request(), - ) + .record_worker_heartbeat(request.into_request()) .await? 
.into_inner()) } + fn replace_client(&self, new_client: Client) { + self.client.get_client().replace_client(new_client); + } + fn capabilities(&self) -> Option { self.client .get_client() @@ -732,7 +740,7 @@ impl WorkerClient for WorkerClientBag { .cloned() } - fn workers(&self) -> Arc { + fn workers(&self) -> Arc { self.client.get_client().inner_cow().inner().workers() } @@ -746,9 +754,57 @@ impl WorkerClient for WorkerClientBag { (opts.client_name.clone(), opts.client_version.clone()) } - fn get_identity(&self) -> String { + fn identity(&self) -> String { self.identity.clone() } + + fn worker_grouping_key(&self) -> Uuid { + self.client.get_client().inner_cow().worker_grouping_key() + } + + fn set_heartbeat_client_fields(&self, heartbeat: &mut WorkerHeartbeat) { + if let Some(host_info) = heartbeat.host_info.as_mut() { + host_info.process_key = self.worker_grouping_key().to_string(); + } + heartbeat.worker_identity = WorkerClient::identity(self); + let sdk_name_and_ver = self.sdk_name_and_version(); + heartbeat.sdk_name = sdk_name_and_ver.0; + heartbeat.sdk_version = sdk_name_and_ver.1; + + let now = SystemTime::now(); + heartbeat.heartbeat_time = Some(now.into()); + let mut heartbeat_map = self.worker_heartbeat_map.lock(); + let client_heartbeat_data = heartbeat_map + .entry(heartbeat.worker_instance_key.clone()) + .or_default(); + let elapsed_since_last_heartbeat = + client_heartbeat_data.last_heartbeat_time.map(|hb_time| { + let dur = now.duration_since(hb_time).unwrap_or(Duration::ZERO); + PbDuration { + seconds: dur.as_secs() as i64, + nanos: dur.subsec_nanos() as i32, + } + }); + heartbeat.elapsed_since_last_heartbeat = elapsed_since_last_heartbeat; + client_heartbeat_data.last_heartbeat_time = Some(now); + + update_slots( + &mut heartbeat.workflow_task_slots_info, + &mut client_heartbeat_data.workflow_task_slots_info, + ); + update_slots( + &mut heartbeat.activity_task_slots_info, + &mut client_heartbeat_data.activity_task_slots_info, + ); + update_slots( + &mut heartbeat.nexus_task_slots_info, + &mut client_heartbeat_data.nexus_task_slots_info, + ); + update_slots( + &mut heartbeat.local_activity_slots_info, + &mut client_heartbeat_data.local_activity_slots_info, + ); + } } impl NamespacedClient for WorkerClientBag { @@ -786,3 +842,31 @@ pub struct WorkflowTaskCompletion { /// Versioning behavior of the workflow, if any. 
pub versioning_behavior: VersioningBehavior, } + +#[derive(Clone, Default)] +struct SlotsInfo { + total_processed_tasks: i32, + total_failed_tasks: i32, +} + +#[derive(Clone, Default)] +struct ClientHeartbeatData { + last_heartbeat_time: Option, + + workflow_task_slots_info: SlotsInfo, + activity_task_slots_info: SlotsInfo, + nexus_task_slots_info: SlotsInfo, + local_activity_slots_info: SlotsInfo, +} + +fn update_slots(slots_info: &mut Option, client_heartbeat_data: &mut SlotsInfo) { + if let Some(wft_slot_info) = slots_info.as_mut() { + wft_slot_info.last_interval_processed_tasks = + wft_slot_info.total_processed_tasks - client_heartbeat_data.total_processed_tasks; + wft_slot_info.last_interval_failure_tasks = + wft_slot_info.total_failed_tasks - client_heartbeat_data.total_failed_tasks; + + client_heartbeat_data.total_processed_tasks = wft_slot_info.total_processed_tasks; + client_heartbeat_data.total_failed_tasks = wft_slot_info.total_failed_tasks; + } +} diff --git a/core/src/worker/client/mocks.rs b/core/src/worker/client/mocks.rs index 317eb04eb..26a1b18da 100644 --- a/core/src/worker/client/mocks.rs +++ b/core/src/worker/client/mocks.rs @@ -1,10 +1,10 @@ use super::*; use futures_util::Future; use std::sync::{Arc, LazyLock}; -use temporal_client::SlotManager; +use temporal_client::ClientWorkerSet; -pub(crate) static DEFAULT_WORKERS_REGISTRY: LazyLock> = - LazyLock::new(|| Arc::new(SlotManager::new())); +pub(crate) static DEFAULT_WORKERS_REGISTRY: LazyLock> = + LazyLock::new(|| Arc::new(ClientWorkerSet::new())); pub(crate) static DEFAULT_TEST_CAPABILITIES: &Capabilities = &Capabilities { signal_and_query_header: true, @@ -30,11 +30,18 @@ pub fn mock_worker_client() -> MockWorkerClient { .returning(|| DEFAULT_WORKERS_REGISTRY.clone()); r.expect_is_mock().returning(|| true); r.expect_shutdown_worker() - .returning(|_| Ok(ShutdownWorkerResponse {})); + .returning(|_, _| Ok(ShutdownWorkerResponse {})); r.expect_sdk_name_and_version() .returning(|| ("test-core".to_string(), "0.0.0".to_string())); - r.expect_get_identity() + r.expect_identity() .returning(|| "test-identity".to_string()); + r.expect_worker_grouping_key().returning(Uuid::new_v4); + r.expect_set_heartbeat_client_fields().returning(|hb| { + hb.sdk_name = "test-core".to_string(); + hb.sdk_version = "0.0.0".to_string(); + hb.worker_identity = "test-identity".to_string(); + hb.heartbeat_time = Some(SystemTime::now().into()); + }); r } @@ -48,7 +55,7 @@ pub(crate) fn mock_manual_worker_client() -> MockManualWorkerClient { r.expect_is_mock().returning(|| true); r.expect_sdk_name_and_version() .returning(|| ("test-core".to_string(), "0.0.0".to_string())); - r.expect_get_identity() + r.expect_identity() .returning(|| "test-identity".to_string()); r } @@ -68,7 +75,7 @@ mockall::mock! { -> impl Future> + Send + 'b where 'a: 'b, Self: 'b; - fn poll_nexus_task<'a, 'b>(&self, poll_options: PollOptions) + fn poll_nexus_task<'a, 'b>(&self, poll_options: PollOptions, send_heartbeat: bool) -> impl Future> + Send + 'b where 'a: 'b, Self: 'b; @@ -139,7 +146,7 @@ mockall::mock! { fn respond_legacy_query<'a, 'b>( &self, task_token: TaskToken, - query_result: LegacyQueryResult, + query_result: LegacyQueryResult, ) -> impl Future> + Send + 'b where 'a: 'b, Self: 'b; @@ -147,16 +154,22 @@ mockall::mock! 
{ impl Future> + Send + 'b where 'a: 'b, Self: 'b; - fn shutdown_worker<'a, 'b>(&self, sticky_task_queue: String) -> impl Future> + Send + 'b + fn shutdown_worker<'a, 'b>(&self, sticky_task_queue: String, worker_heartbeat: Option) -> impl Future> + Send + 'b where 'a: 'b, Self: 'b; - fn record_worker_heartbeat<'a, 'b>(&self, heartbeat: WorkerHeartbeat) -> impl Future> + Send + 'b where 'a: 'b, Self: 'b; + fn record_worker_heartbeat<'a, 'b>( + &self, + namespace: String, + heartbeat: Vec + ) -> impl Future> + Send + 'b where 'a: 'b, Self: 'b; fn replace_client(&self, new_client: Client); fn capabilities(&self) -> Option; - fn workers(&self) -> Arc; + fn workers(&self) -> Arc; fn is_mock(&self) -> bool; fn sdk_name_and_version(&self) -> (String, String); - fn get_identity(&self) -> String; + fn identity(&self) -> String; + fn worker_grouping_key(&self) -> Uuid; + fn set_heartbeat_client_fields(&self, heartbeat: &mut WorkerHeartbeat); } } diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index f7c8d5694..7ec2f7aa5 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -1,55 +1,87 @@ -use crate::{WorkerClient, abstractions::dbg_panic}; -use gethostname::gethostname; +use crate::WorkerClient; +use crate::worker::{TaskPollers, WorkerTelemetry}; use parking_lot::Mutex; -use prost_types::Duration as PbDuration; -use std::{ - sync::{Arc, OnceLock}, - time::{Duration, SystemTime}, +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; +use temporal_client::SharedNamespaceWorkerTrait; +use temporal_sdk_core_api::worker::{ + PollerBehavior, WorkerConfigBuilder, WorkerVersioningStrategy, }; -use temporal_sdk_core_api::worker::WorkerConfig; -use temporal_sdk_core_protos::temporal::api::worker::v1::{WorkerHeartbeat, WorkerHostInfo}; -use tokio::{sync::Notify, task::JoinHandle, time::MissedTickBehavior}; +use temporal_sdk_core_protos::temporal::api::worker::v1::WorkerHeartbeat; +use tokio::sync::Notify; +use tokio_util::sync::CancellationToken; use uuid::Uuid; -pub(crate) type HeartbeatFn = Box Option + Send + Sync>; +/// Callback used to collect heartbeat data from each worker at the time of heartbeat +pub(crate) type HeartbeatFn = Arc WorkerHeartbeat + Send + Sync>; -pub(crate) struct WorkerHeartbeatManager { - heartbeat_handle: JoinHandle<()>, +/// SharedNamespaceWorker is responsible for polling nexus-delivered worker commands and sending +/// worker heartbeats to the server. This invokes callbacks on all workers in the same process that +/// share the same namespace. 
+pub(crate) struct SharedNamespaceWorker { + heartbeat_map: Arc>>, + namespace: String, + cancel: CancellationToken, } -impl WorkerHeartbeatManager { +impl SharedNamespaceWorker { pub(crate) fn new( - config: WorkerConfig, - identity: String, - heartbeat_fn: Arc>, client: Arc, - ) -> Self { - let sdk_name_and_ver = client.sdk_name_and_version(); - let reset_notify = Arc::new(Notify::new()); - let data = Arc::new(Mutex::new(WorkerHeartbeatData::new( + namespace: String, + heartbeat_interval: Duration, + telemetry: Option, + ) -> Result { + let config = WorkerConfigBuilder::default() + .namespace(namespace.clone()) + .task_queue(format!( + "temporal-sys/worker-commands/{namespace}/{}", + client.worker_grouping_key(), + )) + .no_remote_activities(true) + .max_outstanding_nexus_tasks(5_usize) + .versioning_strategy(WorkerVersioningStrategy::None { + build_id: "1.0".to_owned(), + }) + .nexus_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) + .build() + .expect("all required fields should be implemented"); + let worker = crate::worker::Worker::new_with_pollers( config, - identity, - sdk_name_and_ver, - reset_notify.clone(), - ))); - let data_clone = data.clone(); - - let heartbeat_handle = tokio::spawn(async move { - let mut ticker = tokio::time::interval(data_clone.lock().heartbeat_interval); - ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); + None, + client.clone(), + TaskPollers::Real, + telemetry, + None, + true, + )?; + + let reset_notify = Arc::new(Notify::new()); + let cancel = CancellationToken::new(); + let cancel_clone = cancel.clone(); + + let client_clone = client; + let namespace_clone = namespace.clone(); + + let heartbeat_map = Arc::new(Mutex::new(HashMap::::new())); + let heartbeat_map_clone = heartbeat_map.clone(); + + tokio::spawn(async move { + let mut ticker = tokio::time::interval(heartbeat_interval); loop { tokio::select! { _ = ticker.tick() => { - let heartbeat = if let Some(heartbeat) = data_clone.lock().capture_heartbeat_if_needed() { - heartbeat - } else { - continue - }; - if let Err(e) = client.clone().record_worker_heartbeat(heartbeat).await { - if matches!( - e.code(), - tonic::Code::Unimplemented - ) { + let mut hb_to_send = Vec::new(); + for (_instance_key, heartbeat_callback) in heartbeat_map_clone.lock().iter() { + let mut heartbeat = heartbeat_callback(); + // All of these heartbeat details rely on a client. 
To avoid circular + // dependencies, this must be populated from within SharedNamespaceWorker + // to get info from the current client + client_clone.set_heartbeat_client_fields(&mut heartbeat); + hb_to_send.push(heartbeat); + } + if let Err(e) = client_clone.record_worker_heartbeat(namespace_clone.clone(), hb_to_send).await { + if matches!(e.code(), tonic::Code::Unimplemented) { return; } warn!(error=?e, "Network error while sending worker heartbeat"); @@ -58,131 +90,76 @@ impl WorkerHeartbeatManager { _ = reset_notify.notified() => { ticker.reset(); } + _ = cancel_clone.cancelled() => { + worker.shutdown().await; + return; + } } } }); - let data_clone = data.clone(); - if heartbeat_fn - .set(Box::new(move || { - data_clone.lock().capture_heartbeat_if_needed() - })) - .is_err() - { - dbg_panic!( - "Failed to set heartbeat_fn, heartbeat_fn should only be set once, when a singular WorkerHeartbeatInfo is created" - ); - } - - Self { heartbeat_handle } - } - - pub(crate) fn shutdown(&self) { - self.heartbeat_handle.abort() + Ok(Self { + heartbeat_map, + namespace, + cancel, + }) } } -#[derive(Debug, Clone)] -struct WorkerHeartbeatData { - worker_instance_key: String, - worker_identity: String, - host_info: WorkerHostInfo, - // Time of the last heartbeat. This is used to both for heartbeat_time and last_heartbeat_time - heartbeat_time: Option, - task_queue: String, - /// SDK name - sdk_name: String, - /// SDK version - sdk_version: String, - /// Worker start time - start_time: SystemTime, - heartbeat_interval: Duration, - reset_notify: Arc, -} +impl SharedNamespaceWorkerTrait for SharedNamespaceWorker { + fn namespace(&self) -> String { + self.namespace.clone() + } -impl WorkerHeartbeatData { - fn new( - worker_config: WorkerConfig, - worker_identity: String, - sdk_name_and_ver: (String, String), - reset_notify: Arc, - ) -> Self { - Self { - worker_identity, - host_info: WorkerHostInfo { - host_name: gethostname().to_string_lossy().to_string(), - process_id: std::process::id().to_string(), - ..Default::default() - }, - sdk_name: sdk_name_and_ver.0, - sdk_version: sdk_name_and_ver.1, - task_queue: worker_config.task_queue.clone(), - start_time: SystemTime::now(), - heartbeat_time: None, - worker_instance_key: Uuid::new_v4().to_string(), - heartbeat_interval: worker_config - .heartbeat_interval - .expect("WorkerHeartbeatData is only called when heartbeat_interval is Some"), - reset_notify, + fn register_callback(&self, worker_instance_key: Uuid, heartbeat_callback: HeartbeatFn) { + self.heartbeat_map + .lock() + .insert(worker_instance_key, heartbeat_callback); + } + fn unregister_callback(&self, worker_instance_key: Uuid) -> (Option, bool) { + let mut heartbeat_map = self.heartbeat_map.lock(); + let heartbeat_callback = heartbeat_map.remove(&worker_instance_key); + if heartbeat_map.is_empty() { + self.cancel.cancel(); } + (heartbeat_callback, heartbeat_map.is_empty()) } - fn capture_heartbeat_if_needed(&mut self) -> Option { - let now = SystemTime::now(); - let elapsed_since_last_heartbeat = if let Some(heartbeat_time) = self.heartbeat_time { - let dur = now.duration_since(heartbeat_time).unwrap_or(Duration::ZERO); - - // Only send poll data if it's nearly been a full interval since this data has been sent - // In this case, "nearly" is 90% of the interval - if dur.as_secs_f64() < 0.9 * self.heartbeat_interval.as_secs_f64() { - return None; - } - Some(PbDuration { - seconds: dur.as_secs() as i64, - nanos: dur.subsec_nanos() as i32, - }) - } else { - None - }; - - self.heartbeat_time = 
Some(now); - - self.reset_notify.notify_one(); - - Some(WorkerHeartbeat { - worker_instance_key: self.worker_instance_key.clone(), - worker_identity: self.worker_identity.clone(), - host_info: Some(self.host_info.clone()), - task_queue: self.task_queue.clone(), - sdk_name: self.sdk_name.clone(), - sdk_version: self.sdk_version.clone(), - status: 0, - start_time: Some(self.start_time.into()), - heartbeat_time: Some(SystemTime::now().into()), - elapsed_since_last_heartbeat, - ..Default::default() - }) + fn num_workers(&self) -> usize { + self.heartbeat_map.lock().len() } } #[cfg(test)] mod tests { - use super::*; use crate::{ test_help::{WorkerExt, test_worker_cfg}, worker, worker::client::mocks::mock_worker_client, }; - use std::{sync::Arc, time::Duration}; + use std::{ + sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, + }, + time::Duration, + }; use temporal_sdk_core_api::worker::PollerBehavior; use temporal_sdk_core_protos::temporal::api::workflowservice::v1::RecordWorkerHeartbeatResponse; #[tokio::test] - async fn worker_heartbeat() { + async fn worker_heartbeat_basic() { let mut mock = mock_worker_client(); - mock.expect_record_worker_heartbeat() - .times(2) - .returning(move |heartbeat| { + let heartbeat_count = Arc::new(AtomicUsize::new(0)); + let heartbeat_count_clone = heartbeat_count.clone(); + mock.expect_poll_workflow_task() + .returning(move |_namespace, _task_queue| Ok(Default::default())); + mock.expect_poll_nexus_task() + .returning(move |_poll_options, _send_heartbeat| Ok(Default::default())); + mock.expect_record_worker_heartbeat().times(3).returning( + move |_namespace, worker_heartbeat| { + assert_eq!(1, worker_heartbeat.len()); + let heartbeat = worker_heartbeat[0].clone(); let host_info = heartbeat.host_info.clone().unwrap(); assert_eq!("test-identity", heartbeat.worker_identity); assert!(!heartbeat.worker_instance_key.is_empty()); @@ -193,38 +170,34 @@ mod tests { assert_eq!(host_info.process_id, std::process::id().to_string()); assert_eq!(heartbeat.sdk_name, "test-core"); assert_eq!(heartbeat.sdk_version, "0.0.0"); - assert_eq!(heartbeat.task_queue, "q"); assert!(heartbeat.heartbeat_time.is_some()); assert!(heartbeat.start_time.is_some()); + heartbeat_count_clone.fetch_add(1, Ordering::Relaxed); + Ok(RecordWorkerHeartbeatResponse {}) - }); + }, + ); let config = test_worker_cfg() .activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) .max_outstanding_activities(1_usize) - .heartbeat_interval(Duration::from_millis(200)) .build() .unwrap(); - let heartbeat_fn = Arc::new(OnceLock::new()); let client = Arc::new(mock); - let worker = worker::Worker::new(config, None, client, None, Some(heartbeat_fn.clone())); - heartbeat_fn.get().unwrap()(); - - // heartbeat timer fires once - advance_time(Duration::from_millis(300)).await; - // it hasn't been >90% of the interval since the last heartbeat, so no data should be returned here - assert_eq!(None, heartbeat_fn.get().unwrap()()); - // heartbeat timer fires once - advance_time(Duration::from_millis(300)).await; - + let worker = worker::Worker::new( + config, + None, + client.clone(), + None, + Some(Duration::from_millis(100)), + ) + .unwrap(); + + tokio::time::sleep(Duration::from_millis(250)).await; worker.drain_activity_poller_and_shutdown().await; - } - async fn advance_time(dur: Duration) { - tokio::time::pause(); - tokio::time::advance(dur).await; - tokio::time::resume(); + assert_eq!(3, heartbeat_count.load(Ordering::Relaxed)); } } diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 
a2faf5f10..6ecfca4b8 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -1,6 +1,6 @@ mod activities; pub(crate) mod client; -mod heartbeat; +pub(crate) mod heartbeat; mod nexus; mod slot_provider; pub(crate) mod tuner; @@ -8,10 +8,11 @@ mod workflow; pub use temporal_sdk_core_api::worker::{WorkerConfig, WorkerConfigBuilder}; pub use tuner::{ - FixedSizeSlotSupplier, RealSysInfo, ResourceBasedSlotsOptions, - ResourceBasedSlotsOptionsBuilder, ResourceBasedTuner, ResourceSlotOptions, SlotSupplierOptions, - TunerBuilder, TunerHolder, TunerHolderOptions, TunerHolderOptionsBuilder, + FixedSizeSlotSupplier, ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder, + ResourceBasedTuner, ResourceSlotOptions, SlotSupplierOptions, TunerBuilder, TunerHolder, + TunerHolderOptions, TunerHolderOptionsBuilder, }; +pub(crate) use tuner::{RealSysInfo, SystemResourceInfo}; pub(crate) use activities::{ ExecutingLAId, LocalActRequest, LocalActivityExecutionResult, LocalActivityResolution, @@ -22,11 +23,13 @@ pub(crate) use wft_poller::WFTPollerShared; #[allow(unreachable_pub)] // re-exported in test_help::integ_helpers pub use workflow::LEGACY_QUERY_ID; +use crate::telemetry::WorkerHeartbeatMetrics; +use crate::worker::heartbeat::{HeartbeatFn, SharedNamespaceWorker}; use crate::{ ActivityHeartbeat, CompleteActivityError, PollError, WorkerTrait, abstractions::{MeteredPermitDealer, PermitDealerContextData, dbg_panic}, errors::CompleteWfError, - pollers::{ActivityTaskOptions, BoxedActPoller, BoxedNexusPoller, LongPollBuffer}, + pollers::{BoxedActPoller, BoxedNexusPoller}, protosext::validate_activity_completion, sealed::AnyClient, telemetry::{ @@ -39,32 +42,49 @@ use crate::{ worker::{ activities::{LACompleteAction, LocalActivityManager, NextPendingLAAction}, client::WorkerClient, - heartbeat::{HeartbeatFn, WorkerHeartbeatManager}, nexus::NexusManager, workflow::{ - LAReqSink, LocalResolution, WorkflowBasics, Workflows, wft_poller, - wft_poller::make_wft_poller, + LAReqSink, LocalResolution, WorkflowBasics, Workflows, wft_poller::make_wft_poller, }, }, }; +use crate::{ + pollers::{ActivityTaskOptions, LongPollBuffer}, + worker::workflow::wft_poller, +}; use activities::WorkerActivityTasks; +use anyhow::bail; +use crossbeam_utils::atomic::AtomicCell; use futures_util::{StreamExt, stream}; -use parking_lot::Mutex; +use gethostname::gethostname; +use parking_lot::{Mutex, RwLock}; use slot_provider::SlotProvider; +use std::sync::atomic::AtomicU64; +use std::time::SystemTime; use std::{ convert::TryInto, future, sync::{ - Arc, OnceLock, + Arc, atomic::{AtomicBool, Ordering}, }, time::Duration, }; -use temporal_client::WorkerKey; +use temporal_client::SharedNamespaceWorkerTrait; +use temporal_client::{ClientWorker, HeartbeatCallback, Slot as SlotTrait}; +use temporal_sdk_core_api::telemetry::metrics::TemporalMeter; +use temporal_sdk_core_api::worker::{ + ActivitySlotKind, LocalActivitySlotKind, NexusSlotKind, SlotKind, WorkflowSlotKind, +}; use temporal_sdk_core_api::{ errors::{CompleteNexusError, WorkerValidationError}, worker::PollerBehavior, }; +use temporal_sdk_core_protos::temporal::api::deployment; +use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; +use temporal_sdk_core_protos::temporal::api::worker::v1::{ + WorkerHeartbeat, WorkerHostInfo, WorkerPollerInfo, WorkerSlotsInfo, +}; use temporal_sdk_core_protos::{ TaskToken, coresdk::{ @@ -83,7 +103,8 @@ use temporal_sdk_core_protos::{ use tokio::sync::{mpsc::unbounded_channel, watch}; use 
tokio_stream::wrappers::UnboundedReceiverStream; use tokio_util::sync::CancellationToken; - +use tracing::Subscriber; +use uuid::Uuid; #[cfg(any(feature = "test-utilities", test))] use { crate::{ @@ -100,8 +121,8 @@ use { pub struct Worker { config: WorkerConfig, client: Arc, - /// Registration key to enable eager workflow start for this worker - worker_key: Mutex>, + /// Worker instance key, a unique identifier for this worker + worker_instance_key: Uuid, /// Manages all workflows and WFT processing workflows: Workflows, /// Manages activity tasks for this worker/task queue @@ -121,8 +142,10 @@ pub struct Worker { local_activities_complete: Arc, /// Used to track all permits have been released all_permits_tracker: tokio::sync::Mutex, - /// Used to shutdown the worker heartbeat task - worker_heartbeat: Option, + /// Handles this worker's registration with the client + client_worker_registrator: Arc, + /// Status of the worker + status: Arc>, } struct AllPermitsTracker { @@ -139,6 +162,13 @@ impl AllPermitsTracker { } } +#[derive(Clone)] +pub(crate) struct WorkerTelemetry { + metric_meter: Option, + temporal_metric_meter: Option, + trace_subscriber: Option>, +} + #[async_trait::async_trait] impl WorkerTrait for Worker { async fn validate(&self) -> Result<(), WorkerValidationError> { @@ -234,10 +264,17 @@ impl WorkerTrait for Worker { ); } self.shutdown_token.cancel(); - // First, disable Eager Workflow Start - if let Some(key) = *self.worker_key.lock() { - self.client.workers().unregister(key); + { + *self.status.lock() = WorkerStatus::ShuttingDown; + } + // First, unregister the worker from the client + if !self.client_worker_registrator.shared_namespace_worker { + let _res = self + .client + .workers() + .unregister_worker(self.worker_instance_key); } + // Second, we want to stop polling of both activity and workflow tasks if let Some(atm) = self.at_task_mgr.as_ref() { atm.initiate_shutdown(); @@ -263,6 +300,10 @@ impl WorkerTrait for Worker { async fn finalize_shutdown(self) { self.finalize_shutdown().await } + + fn worker_instance_key(&self) -> Uuid { + self.worker_instance_key + } } impl Worker { @@ -275,38 +316,62 @@ impl Worker { sticky_queue_name: Option, client: Arc, telem_instance: Option<&TelemetryInstance>, - heartbeat_fn: Option>>, - ) -> Self { + worker_heartbeat_interval: Option, + ) -> Result { info!(task_queue=%config.task_queue, namespace=%config.namespace, "Initializing worker"); + let worker_telemetry = telem_instance.map(|telem| WorkerTelemetry { + metric_meter: telem.get_metric_meter(), + temporal_metric_meter: telem.get_temporal_metric_meter(), + trace_subscriber: telem.trace_subscriber(), + }); + Self::new_with_pollers( config, sticky_queue_name, client, TaskPollers::Real, - telem_instance, - heartbeat_fn, + worker_telemetry, + worker_heartbeat_interval, + false, ) } - /// Replace client. For eager workflow purposes, this new client will now apply to future - /// eager start requests and the older client will not. - pub fn replace_client(&self, new_client: CT) + /// Replace client. + /// + /// For eager workflow purposes, this new client will now apply to future eager start requests + /// and the older client will not. Note that if this registration fails, the worker heartbeat will + /// also not be registered. + /// + /// For worker heartbeat, this will remove the existing shared namespace worker if this is the last + /// worker using the old client, and create a new one if this is the first worker for the namespace + /// on the new client. 
+ pub fn replace_client(&self, new_client: CT) -> Result<(), anyhow::Error> where CT: Into, { // Unregister worker from current client, register in new client at the end - let mut worker_key = self.worker_key.lock(); - let slot_provider = (*worker_key).and_then(|k| self.client.workers().unregister(k)); + let client_worker = self + .client + .workers() + .unregister_worker(self.worker_instance_key)?; + + let new_worker_client = super::init_worker_client( + self.config.namespace.clone(), + self.config.client_identity_override.clone(), + new_client, + ); + + self.client.replace_client(new_worker_client); + *self.client_worker_registrator.client.write() = self.client.clone(); self.client - .replace_client(super::init_worker_client(&self.config, new_client)); - *worker_key = - slot_provider.and_then(|slot_provider| self.client.workers().register(slot_provider)); + .workers() + .register_worker(client_worker, self.config.skip_client_worker_set_check) } #[cfg(test)] pub(crate) fn new_test(config: WorkerConfig, client: impl WorkerClient + 'static) -> Self { - Self::new(config, None, Arc::new(client), None, None) + Self::new(config, None, Arc::new(client), None, None).unwrap() } pub(crate) fn new_with_pollers( @@ -314,28 +379,36 @@ impl Worker { sticky_queue_name: Option, client: Arc, task_pollers: TaskPollers, - telem_instance: Option<&TelemetryInstance>, - heartbeat_fn: Option>>, - ) -> Self { - let (metrics, meter) = if let Some(ti) = telem_instance { + worker_telemetry: Option, + worker_heartbeat_interval: Option, + shared_namespace_worker: bool, + ) -> Result { + let (metrics, meter) = if let Some(wt) = worker_telemetry.as_ref() { ( - MetricsContext::top_level(config.namespace.clone(), config.task_queue.clone(), ti), - ti.get_metric_meter(), + MetricsContext::top_level_with_meter( + config.namespace.clone(), + config.task_queue.clone(), + wt.temporal_metric_meter.clone(), + ), + wt.metric_meter.clone(), ) } else { (MetricsContext::no_op(), None) }; - let tuner = config - .tuner - .as_ref() - .cloned() - .unwrap_or_else(|| Arc::new(TunerBuilder::from_config(&config).build())); + + let mut sys_info = None; + let tuner = config.tuner.as_ref().cloned().unwrap_or_else(|| { + let mut tuner_builder = TunerBuilder::from_config(&config); + sys_info = tuner_builder.get_sys_info(); + Arc::new(tuner_builder.build()) + }); + let sys_info = sys_info.unwrap_or_else(|| Arc::new(RealSysInfo::new())); metrics.worker_registered(); let shutdown_token = CancellationToken::new(); let slot_context_data = Arc::new(PermitDealerContextData { task_queue: config.task_queue.clone(), - worker_identity: client.get_identity(), + worker_identity: client.identity(), worker_deployment_version: config.computed_deployment_version(), }); let wft_slots = MeteredPermitDealer::new( @@ -361,6 +434,12 @@ impl Worker { ); let act_permits = act_slots.get_extant_count_rcv(); let (external_wft_tx, external_wft_rx) = unbounded_channel(); + + let wf_last_suc_poll_time = Arc::new(AtomicCell::new(None)); + let wf_sticky_last_suc_poll_time = Arc::new(AtomicCell::new(None)); + let act_last_suc_poll_time = Arc::new(AtomicCell::new(None)); + let nexus_last_suc_poll_time = Arc::new(AtomicCell::new(None)); + let nexus_slots = MeteredPermitDealer::new( tuner.nexus_task_slot_supplier(), metrics.with_new_attrs([nexus_worker_type()]), @@ -377,6 +456,8 @@ impl Worker { &metrics, &shutdown_token, &wft_slots, + wf_last_suc_poll_time.clone(), + wf_sticky_last_suc_poll_time.clone(), ); let wft_stream = if !client.is_mock() { // Some replay tests combine a 
mock client with real pollers, @@ -402,11 +483,13 @@ impl Worker { max_worker_acts_per_second: config.max_worker_activities_per_second, max_tps: config.max_task_queue_activities_per_second, }, + act_last_suc_poll_time.clone(), ); Some(Box::from(ap) as BoxedActPoller) }; let np_metrics = metrics.with_new_attrs([nexus_poller()]); + let nexus_poll_buffer = Box::new(LongPollBuffer::new_nexus_task( client.clone(), config.task_queue.clone(), @@ -414,6 +497,8 @@ impl Worker { nexus_slots.clone(), shutdown_token.child_token(), Some(move |np| np_metrics.record_num_pollers(np)), + nexus_last_suc_poll_time.clone(), + shared_namespace_worker, )) as BoxedNexusPoller; #[cfg(any(feature = "test-utilities", test))] @@ -457,13 +542,13 @@ impl Worker { let la_permits = la_permit_dealer.get_extant_count_rcv(); let local_act_mgr = Arc::new(LocalActivityManager::new( config.namespace.clone(), - la_permit_dealer, + la_permit_dealer.clone(), hb_tx, metrics.clone(), )); let at_task_mgr = act_poller.map(|ap| { WorkerActivityTasks::new( - act_slots, + act_slots.clone(), ap, client.clone(), metrics.clone(), @@ -474,7 +559,7 @@ impl Worker { ) }); let poll_on_non_local_activities = at_task_mgr.is_some(); - if !poll_on_non_local_activities { + if !poll_on_non_local_activities && !shared_namespace_worker { info!("Activity polling is disabled for this worker"); }; let la_sink = LAReqSink::new(local_act_mgr.clone()); @@ -499,20 +584,50 @@ impl Worker { external_wft_tx, deployment_options, ); - let worker_key = Mutex::new(client.workers().register(Box::new(provider))); - let sdk_name_and_ver = client.sdk_name_and_version(); + let worker_instance_key = Uuid::new_v4(); + let worker_status = Arc::new(Mutex::new(WorkerStatus::Running)); - let worker_heartbeat = heartbeat_fn.map(|heartbeat_fn| { + let sdk_name_and_ver = client.sdk_name_and_version(); + let worker_heartbeat = worker_heartbeat_interval.map(|hb_interval| { + let hb_metrics = HeartbeatMetrics { + in_mem_metrics: metrics.in_memory_meter(), + wft_slots: wft_slots.clone(), + act_slots, + nexus_slots, + la_slots: la_permit_dealer, + wf_last_suc_poll_time, + wf_sticky_last_suc_poll_time, + act_last_suc_poll_time, + nexus_last_suc_poll_time, + status: worker_status.clone(), + sys_info, + }; WorkerHeartbeatManager::new( config.clone(), - client.get_identity(), - heartbeat_fn, - client.clone(), + worker_instance_key, + hb_interval, + worker_telemetry.clone(), + hb_metrics, ) }); - Self { - worker_key, + let client_worker_registrator = Arc::new(ClientWorkerRegistrator { + worker_instance_key, + slot_provider: provider, + heartbeat_manager: worker_heartbeat, + client: RwLock::new(client.clone()), + shared_namespace_worker, + }); + + if !shared_namespace_worker { + client.workers().register_worker( + client_worker_registrator.clone(), + config.skip_client_worker_set_check, + )?; + } + + Ok(Self { + worker_instance_key, client: client.clone(), workflows: Workflows::new( WorkflowBasics { @@ -551,7 +666,9 @@ impl Worker { _ => Some(mgr.get_handle_for_workflows()), } }), - telem_instance, + worker_telemetry + .as_ref() + .and_then(|telem| telem.trace_subscriber.clone()), ), at_task_mgr, local_act_mgr, @@ -567,8 +684,9 @@ impl Worker { la_permits, }), nexus_mgr, - worker_heartbeat, - } + client_worker_registrator, + status: worker_status, + }) } /// Will shutdown the worker. 
Does not resolve until all outstanding workflow tasks have been @@ -576,8 +694,14 @@ impl Worker { async fn shutdown(&self) { self.initiate_shutdown(); if let Some(name) = self.workflows.get_sticky_queue_name() { + let heartbeat = self + .client_worker_registrator + .heartbeat_manager + .as_ref() + .map(|hm| hm.heartbeat_callback.clone()()); + // This is a best effort call and we can still shutdown the worker if it fails - match self.client.shutdown_worker(name).await { + match self.client.shutdown_worker(name, heartbeat).await { Err(err) if !matches!( err.code(), @@ -612,9 +736,6 @@ impl Worker { dbg_panic!("Waiting for all slot permits to release took too long!"); } } - if let Some(heartbeat) = self.worker_heartbeat.as_ref() { - heartbeat.shutdown(); - } } /// Finish shutting down by consuming the background pollers and freeing all resources @@ -871,6 +992,225 @@ impl Worker { } } +struct ClientWorkerRegistrator { + worker_instance_key: Uuid, + slot_provider: SlotProvider, + heartbeat_manager: Option, + client: RwLock>, + shared_namespace_worker: bool, +} + +impl ClientWorker for ClientWorkerRegistrator { + fn namespace(&self) -> &str { + self.slot_provider.namespace() + } + fn task_queue(&self) -> &str { + self.slot_provider.task_queue() + } + + fn try_reserve_wft_slot(&self) -> Option> { + self.slot_provider.try_reserve_wft_slot() + } + + fn deployment_options(&self) -> Option { + self.slot_provider.deployment_options() + } + + fn worker_instance_key(&self) -> Uuid { + self.worker_instance_key + } + + fn heartbeat_enabled(&self) -> bool { + self.heartbeat_manager.is_some() + } + + fn heartbeat_callback(&self) -> Option { + if let Some(hb_mgr) = self.heartbeat_manager.as_ref() { + Some(hb_mgr.heartbeat_callback.clone()) + } else { + None + } + } + + fn new_shared_namespace_worker( + &self, + ) -> Result, anyhow::Error> { + if let Some(ref hb_mgr) = self.heartbeat_manager { + Ok(Box::new(SharedNamespaceWorker::new( + self.client.read().clone(), + self.namespace().to_string(), + hb_mgr.heartbeat_interval, + hb_mgr.telemetry.clone(), + )?)) + } else { + bail!("Shared namespace worker creation never be called without a heartbeat manager"); + } + } +} + +struct HeartbeatMetrics { + in_mem_metrics: Option>, + wft_slots: MeteredPermitDealer, + act_slots: MeteredPermitDealer, + nexus_slots: MeteredPermitDealer, + la_slots: MeteredPermitDealer, + wf_last_suc_poll_time: Arc>>, + wf_sticky_last_suc_poll_time: Arc>>, + act_last_suc_poll_time: Arc>>, + nexus_last_suc_poll_time: Arc>>, + status: Arc>, + sys_info: Arc, +} + +struct WorkerHeartbeatManager { + /// Heartbeat interval, defaults to 60s + heartbeat_interval: Duration, + /// Telemetry instance, needed to initialize [SharedNamespaceWorker] when replacing client + telemetry: Option, + /// Heartbeat callback + heartbeat_callback: Arc WorkerHeartbeat + Send + Sync>, +} + +impl WorkerHeartbeatManager { + fn new( + config: WorkerConfig, + worker_instance_key: Uuid, + heartbeat_interval: Duration, + telemetry_instance: Option, + heartbeat_manager_metrics: HeartbeatMetrics, + ) -> Self { + let start_time = Some(SystemTime::now().into()); + let worker_heartbeat_callback: HeartbeatFn = Arc::new(move || { + let deployment_version = config.computed_deployment_version().map(|dv| { + deployment::v1::WorkerDeploymentVersion { + deployment_name: dv.deployment_name, + build_id: dv.build_id, + } + }); + + let mut worker_heartbeat = WorkerHeartbeat { + worker_instance_key: worker_instance_key.to_string(), + host_info: Some(WorkerHostInfo { + host_name: 
gethostname().to_string_lossy().to_string(), + process_id: std::process::id().to_string(), + current_host_cpu_usage: heartbeat_manager_metrics.sys_info.used_cpu_percent() + as f32, + current_host_mem_usage: heartbeat_manager_metrics.sys_info.used_mem_percent() + as f32, + + // Set by SharedNamespaceWorker because it relies on the client + process_key: String::new(), + }), + task_queue: config.task_queue.clone(), + deployment_version, + + status: (*heartbeat_manager_metrics.status.lock()) as i32, + start_time, + plugins: config.plugins.clone(), + + // Some Metrics dependent fields are set below, and + // some fields like sdk_name, sdk_version, and worker_identity, must be set by + // SharedNamespaceWorker because they rely on the client, and + // need to be pulled from the current client used by SharedNamespaceWorker + ..Default::default() + }; + + if let Some(in_mem) = heartbeat_manager_metrics.in_mem_metrics.as_ref() { + worker_heartbeat.total_sticky_cache_hit = + in_mem.total_sticky_cache_hit.load(Ordering::Relaxed) as i32; + worker_heartbeat.total_sticky_cache_miss = + in_mem.total_sticky_cache_miss.load(Ordering::Relaxed) as i32; + worker_heartbeat.current_sticky_cache_size = + in_mem.sticky_cache_size.load(Ordering::Relaxed) as i32; + + worker_heartbeat.workflow_poller_info = Some(WorkerPollerInfo { + current_pollers: in_mem + .num_pollers + .wft_current_pollers + .load(Ordering::Relaxed) as i32, + last_successful_poll_time: heartbeat_manager_metrics + .wf_last_suc_poll_time + .load() + .map(|time| time.into()), + is_autoscaling: config.workflow_task_poller_behavior.is_autoscaling(), + }); + worker_heartbeat.workflow_sticky_poller_info = Some(WorkerPollerInfo { + current_pollers: in_mem + .num_pollers + .sticky_wft_current_pollers + .load(Ordering::Relaxed) as i32, + last_successful_poll_time: heartbeat_manager_metrics + .wf_sticky_last_suc_poll_time + .load() + .map(|time| time.into()), + is_autoscaling: config.workflow_task_poller_behavior.is_autoscaling(), + }); + worker_heartbeat.activity_poller_info = Some(WorkerPollerInfo { + current_pollers: in_mem + .num_pollers + .activity_current_pollers + .load(Ordering::Relaxed) as i32, + last_successful_poll_time: heartbeat_manager_metrics + .act_last_suc_poll_time + .load() + .map(|time| time.into()), + is_autoscaling: config.activity_task_poller_behavior.is_autoscaling(), + }); + worker_heartbeat.nexus_poller_info = Some(WorkerPollerInfo { + current_pollers: in_mem + .num_pollers + .nexus_current_pollers + .load(Ordering::Relaxed) as i32, + last_successful_poll_time: heartbeat_manager_metrics + .nexus_last_suc_poll_time + .load() + .map(|time| time.into()), + is_autoscaling: config.nexus_task_poller_behavior.is_autoscaling(), + }); + + worker_heartbeat.workflow_task_slots_info = make_slots_info( + &heartbeat_manager_metrics.wft_slots, + in_mem.worker_task_slots_available.workflow_worker.clone(), + in_mem.worker_task_slots_used.workflow_worker.clone(), + in_mem.workflow_task_execution_latency.clone(), + in_mem.workflow_task_execution_failed.clone(), + ); + worker_heartbeat.activity_task_slots_info = make_slots_info( + &heartbeat_manager_metrics.act_slots, + in_mem.worker_task_slots_available.activity_worker.clone(), + in_mem.worker_task_slots_used.activity_worker.clone(), + in_mem.activity_execution_latency.clone(), + in_mem.activity_execution_failed.clone(), + ); + worker_heartbeat.nexus_task_slots_info = make_slots_info( + &heartbeat_manager_metrics.nexus_slots, + in_mem.worker_task_slots_available.nexus_worker.clone(), + 
in_mem.worker_task_slots_used.nexus_worker.clone(), + in_mem.nexus_task_execution_latency.clone(), + in_mem.nexus_task_execution_failed.clone(), + ); + worker_heartbeat.local_activity_slots_info = make_slots_info( + &heartbeat_manager_metrics.la_slots, + in_mem + .worker_task_slots_available + .local_activity_worker + .clone(), + in_mem.worker_task_slots_used.local_activity_worker.clone(), + in_mem.local_activity_execution_latency.clone(), + in_mem.local_activity_execution_failed.clone(), + ); + } + worker_heartbeat + }); + + WorkerHeartbeatManager { + heartbeat_interval, + telemetry: telemetry_instance, + heartbeat_callback: worker_heartbeat_callback, + } + } +} + pub(crate) struct PostActivateHookData<'a> { pub(crate) run_id: &'a str, pub(crate) replaying: bool, @@ -908,6 +1248,31 @@ fn wft_poller_behavior(config: &WorkerConfig, is_sticky: bool) -> PollerBehavior } } +fn make_slots_info( + dealer: &MeteredPermitDealer, + slots_available: Arc, + slots_used: Arc, + total_processed: Arc, + total_failed: Arc, +) -> Option +where + SK: SlotKind + 'static, +{ + Some(WorkerSlotsInfo { + current_available_slots: i32::try_from(slots_available.load(Ordering::Relaxed)) + .unwrap_or(-1), + current_used_slots: i32::try_from(slots_used.load(Ordering::Relaxed)).unwrap_or(-1), + slot_supplier_kind: dealer.slot_supplier_kind().to_string(), + total_processed_tasks: i32::try_from(total_processed.load(Ordering::Relaxed)) + .unwrap_or(i32::MIN), + total_failed_tasks: i32::try_from(total_failed.load(Ordering::Relaxed)).unwrap_or(i32::MIN), + + // Filled in by heartbeat later + last_interval_processed_tasks: 0, + last_interval_failure_tasks: 0, + }) +} + #[cfg(test)] mod tests { use super::*; diff --git a/core/src/worker/slot_provider.rs b/core/src/worker/slot_provider.rs index e150a7124..9139f6796 100644 --- a/core/src/worker/slot_provider.rs +++ b/core/src/worker/slot_provider.rs @@ -7,7 +7,7 @@ use crate::{ protosext::ValidPollWFTQResponse, worker::workflow::wft_poller::validate_wft, }; -use temporal_client::{Slot as SlotTrait, SlotProvider as SlotProviderTrait}; +use temporal_client::Slot as SlotTrait; use temporal_sdk_core_api::worker::WorkflowSlotKind; use temporal_sdk_core_protos::temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse; use tokio::sync::mpsc::UnboundedSender; @@ -77,22 +77,21 @@ impl SlotProvider { deployment_options, } } -} - -impl SlotProviderTrait for SlotProvider { - fn namespace(&self) -> &str { + pub(super) fn namespace(&self) -> &str { &self.namespace } - fn task_queue(&self) -> &str { + pub(super) fn task_queue(&self) -> &str { &self.task_queue } - fn try_reserve_wft_slot(&self) -> Option> { + pub(super) fn try_reserve_wft_slot(&self) -> Option> { match self.wft_semaphore.try_acquire_owned().ok() { Some(permit) => Some(Box::new(Slot::new(permit, self.external_wft_tx.clone()))), None => None, } } - fn deployment_options(&self) -> Option { + pub(super) fn deployment_options( + &self, + ) -> Option { self.deployment_options.clone() } } diff --git a/core/src/worker/tuner.rs b/core/src/worker/tuner.rs index ed592a6f7..0a7cadcc9 100644 --- a/core/src/worker/tuner.rs +++ b/core/src/worker/tuner.rs @@ -3,10 +3,12 @@ mod resource_based; pub use fixed_size::FixedSizeSlotSupplier; pub use resource_based::{ - RealSysInfo, ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder, ResourceBasedTuner, + ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder, ResourceBasedTuner, ResourceSlotOptions, }; +pub(crate) use resource_based::{RealSysInfo, SystemResourceInfo}; + use 
std::sync::Arc; use temporal_sdk_core_api::worker::{ ActivitySlotKind, LocalActivitySlotKind, NexusSlotKind, SlotKind, SlotSupplier, WorkerConfig, @@ -126,6 +128,9 @@ impl TunerHolderOptions { } None => {} } + if let Some(tuner) = rb_tuner { + builder.sys_info(tuner.sys_info()); + } Ok(builder.build()) } } @@ -187,6 +192,7 @@ pub struct TunerBuilder { local_activity_slot_supplier: Option + Send + Sync>>, nexus_slot_supplier: Option + Send + Sync>>, + sys_info: Option>, } impl TunerBuilder { @@ -243,6 +249,17 @@ impl TunerBuilder { self } + /// Sets a field that implements [SystemResourceInfo] + pub fn sys_info(&mut self, sys_info: Arc) -> &mut Self { + self.sys_info = Some(sys_info); + self + } + + /// Gets the field that implements [SystemResourceInfo] + pub fn get_sys_info(&self) -> Option> { + self.sys_info.clone() + } + /// Build a [WorkerTuner] from the configured slot suppliers pub fn build(&mut self) -> TunerHolder { TunerHolder { diff --git a/core/src/worker/tuner/fixed_size.rs b/core/src/worker/tuner/fixed_size.rs index aa737dc8b..e1bf53d6e 100644 --- a/core/src/worker/tuner/fixed_size.rs +++ b/core/src/worker/tuner/fixed_size.rs @@ -50,4 +50,8 @@ where fn available_slots(&self) -> Option { Some(self.sem.available_permits()) } + + fn slot_supplier_kind(&self) -> String { + "Fixed".to_string() + } } diff --git a/core/src/worker/tuner/resource_based.rs b/core/src/worker/tuner/resource_based.rs index 173418413..88606add3 100644 --- a/core/src/worker/tuner/resource_based.rs +++ b/core/src/worker/tuner/resource_based.rs @@ -1,11 +1,13 @@ use crossbeam_utils::atomic::AtomicCell; use parking_lot::Mutex; +use std::sync::mpsc; use std::{ marker::PhantomData, sync::{ Arc, OnceLock, atomic::{AtomicU64, AtomicUsize, Ordering}, }, + thread, time::{Duration, Instant}, }; use temporal_sdk_core_api::{ @@ -31,6 +33,8 @@ pub struct ResourceBasedTuner { act_opts: Option, la_opts: Option, nexus_opts: Option, + + sys_info: Arc, } impl ResourceBasedTuner { @@ -42,25 +46,28 @@ impl ResourceBasedTuner { .target_cpu_usage(target_cpu_usage) .build() .expect("default resource based slot options can't fail to build"); - let controller = ResourceController::new_with_sysinfo(opts, RealSysInfo::new()); + let controller = ResourceController::new_with_sysinfo(opts, Arc::new(RealSysInfo::new())); Self::new_from_controller(controller) } /// Create an instance using the fully configurable set of PID controller options pub fn new_from_options(options: ResourceBasedSlotsOptions) -> Self { - let controller = ResourceController::new_with_sysinfo(options, RealSysInfo::new()); + let controller = + ResourceController::new_with_sysinfo(options, Arc::new(RealSysInfo::new())); Self::new_from_controller(controller) } } impl ResourceBasedTuner { fn new_from_controller(controller: ResourceController) -> Self { + let sys_info = controller.sys_info_supplier.clone(); Self { slots: Arc::new(controller), wf_opts: None, act_opts: None, la_opts: None, nexus_opts: None, + sys_info, } } @@ -87,6 +94,11 @@ impl ResourceBasedTuner { self.nexus_opts = Some(opts); self } + + /// Get sys info + pub fn sys_info(&self) -> Arc { + self.sys_info.clone() + } } const DEFAULT_WF_SLOT_OPTS: ResourceSlotOptions = ResourceSlotOptions { @@ -121,7 +133,7 @@ pub struct ResourceSlotOptions { struct ResourceController { options: ResourceBasedSlotsOptions, - sys_info_supplier: MI, + sys_info_supplier: Arc, metrics: OnceLock>, pids: Mutex, last_metric_vals: Arc>, @@ -314,6 +326,10 @@ where } } } + + fn slot_supplier_kind(&self) -> String { + 
"ResourceBased".to_string() + } } impl ResourceBasedSlotsForType @@ -421,7 +437,7 @@ impl ResourceController { Arc::new(ResourceBasedSlotsForType::new(self.clone(), opts)) } - fn new_with_sysinfo(options: ResourceBasedSlotsOptions, sys_info: MI) -> Self { + fn new_with_sysinfo(options: ResourceBasedSlotsOptions, sys_info: Arc) -> Self { Self { pids: Mutex::new(PidControllers::new(&options)), options, @@ -474,37 +490,14 @@ impl ResourceController { /// Implements [SystemResourceInfo] using the [sysinfo] crate #[derive(Debug)] -pub struct RealSysInfo { +struct RealSysInfoInner { sys: Mutex, total_mem: AtomicU64, cur_mem_usage: AtomicU64, cur_cpu_usage: AtomicU64, - last_refresh: AtomicCell, } -impl RealSysInfo { - fn new() -> Self { - let mut sys = sysinfo::System::new(); - sys.refresh_memory(); - let total_mem = sys.total_memory(); - let s = Self { - sys: Mutex::new(sys), - last_refresh: AtomicCell::new(Instant::now()), - cur_mem_usage: AtomicU64::new(0), - cur_cpu_usage: AtomicU64::new(0), - total_mem: AtomicU64::new(total_mem), - }; - s.refresh(); - s - } - - fn refresh_if_needed(&self) { - // This is all quite expensive and meaningfully slows everything down if it's allowed to - // happen more often. A better approach than a lock would be needed to go faster. - if (Instant::now() - self.last_refresh.load()) > Duration::from_millis(100) { - self.refresh(); - } - } +impl RealSysInfoInner { fn refresh(&self) { let mut lock = self.sys.lock(); lock.refresh_memory(); @@ -522,25 +515,73 @@ impl RealSysInfo { self.cur_mem_usage.store(mem, Ordering::Release); } self.cur_cpu_usage.store(cpu.to_bits(), Ordering::Release); - self.last_refresh.store(Instant::now()); + } +} + +/// Tracks host resource usage by refreshing metrics on a background thread. +pub struct RealSysInfo { + inner: Arc, + shutdown_tx: mpsc::Sender<()>, + shutdown_handle: Mutex>>, +} + +impl RealSysInfo { + pub(crate) fn new() -> Self { + let mut sys = sysinfo::System::new(); + sys.refresh_memory(); + let total_mem = sys.total_memory(); + let inner = Arc::new(RealSysInfoInner { + sys: Mutex::new(sys), + cur_mem_usage: AtomicU64::new(0), + cur_cpu_usage: AtomicU64::new(0), + total_mem: AtomicU64::new(total_mem), + }); + inner.refresh(); + + let thread_clone = inner.clone(); + let (tx, rx) = mpsc::channel::<()>(); + let handle = thread::Builder::new() + .name("temporal-real-sysinfo".to_string()) + .spawn(move || { + const REFRESH_INTERVAL: Duration = Duration::from_millis(100); + loop { + thread_clone.refresh(); + let r = rx.recv_timeout(REFRESH_INTERVAL); + if matches!(r, Err(mpsc::RecvTimeoutError::Disconnected)) || r.is_ok() { + return; + } + } + }) + .expect("failed to spawn RealSysInfo refresh thread"); + + Self { + inner, + shutdown_tx: tx, + shutdown_handle: Mutex::new(Some(handle)), + } } } impl SystemResourceInfo for RealSysInfo { fn total_mem(&self) -> u64 { - self.total_mem.load(Ordering::Acquire) + self.inner.total_mem.load(Ordering::Acquire) } fn used_mem(&self) -> u64 { - // TODO: This should really happen on a background thread since it's getting called from - // the async reserve - self.refresh_if_needed(); - self.cur_mem_usage.load(Ordering::Acquire) + self.inner.cur_mem_usage.load(Ordering::Acquire) } fn used_cpu_percent(&self) -> f64 { - self.refresh_if_needed(); - f64::from_bits(self.cur_cpu_usage.load(Ordering::Acquire)) + f64::from_bits(self.inner.cur_cpu_usage.load(Ordering::Acquire)) + } +} + +impl Drop for RealSysInfo { + fn drop(&mut self) { + let _res = self.shutdown_tx.send(()); + if let Some(handle) = 
self.shutdown_handle.lock().take() { + let _ = handle.join(); + } } } @@ -558,9 +599,9 @@ mod tests { used: Arc, } impl FakeMIS { - fn new() -> (Self, Arc) { + fn new() -> (Arc, Arc) { let used = Arc::new(AtomicU64::new(0)); - (Self { used: used.clone() }, used) + (Arc::new(Self { used: used.clone() }), used) } } impl SystemResourceInfo for FakeMIS { diff --git a/core/src/worker/workflow/mod.rs b/core/src/worker/workflow/mod.rs index 77d07025f..c8103ba11 100644 --- a/core/src/worker/workflow/mod.rs +++ b/core/src/worker/workflow/mod.rs @@ -23,7 +23,7 @@ use crate::{ internal_flags::InternalFlags, pollers::TrackedPermittedTqResp, protosext::{ValidPollWFTQResponse, protocol_messages::IncomingProtocolMessage}, - telemetry::{TelemetryInstance, VecDisplayer, set_trace_subscriber_for_current_thread}, + telemetry::{VecDisplayer, set_trace_subscriber_for_current_thread}, worker::{ LocalActRequest, LocalActivityExecutionResult, LocalActivityResolution, PostActivateHookData, @@ -94,7 +94,7 @@ use tokio::{ }; use tokio_stream::wrappers::UnboundedReceiverStream; use tokio_util::sync::CancellationToken; -use tracing::Span; +use tracing::{Span, Subscriber}; /// Id used by server for "legacy" queries. IE: Queries that come in the `query` rather than /// `queries` field of a WFT, and are responded to on the separate `respond_query_task_completed` @@ -167,7 +167,7 @@ impl Workflows { local_act_mgr: Arc, heartbeat_timeout_rx: UnboundedReceiver, activity_tasks_handle: Option, - telem_instance: Option<&TelemetryInstance>, + tracing_sub: Option>, ) -> Self { let (local_tx, local_rx) = unbounded_channel(); let (fetch_tx, fetch_rx) = unbounded_channel(); @@ -188,7 +188,6 @@ impl Workflows { let (start_polling_tx, start_polling_rx) = oneshot::channel(); // We must spawn a task to constantly poll the activation stream, because otherwise // activation completions would not cause anything to happen until the next poll. 
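For reference, a minimal sketch of how the reworked `RealSysInfo` above is intended to be consumed (crate-internal only, since `new` is `pub(crate)`): readers perform plain atomic loads, freshness comes from the background refresh thread, and dropping the value stops and joins that thread.

    let sys = RealSysInfo::new();      // spawns the "temporal-real-sysinfo" refresh thread
    let _mem = sys.used_mem();         // atomic load only, no inline sysinfo refresh
    let _cpu = sys.used_cpu_percent(); // likewise
    drop(sys);                         // sends the shutdown signal and joins the thread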
- let tracing_sub = telem_instance.and_then(|ti| ti.trace_subscriber()); let processing_task = thread::Builder::new() .name("workflow-processing".to_string()) .spawn(move || { diff --git a/core/src/worker/workflow/wft_poller.rs b/core/src/worker/workflow/wft_poller.rs index 3cc2da579..0a00ad179 100644 --- a/core/src/worker/workflow/wft_poller.rs +++ b/core/src/worker/workflow/wft_poller.rs @@ -6,13 +6,16 @@ use crate::{ telemetry::metrics::{workflow_poller, workflow_sticky_poller}, worker::{client::WorkerClient, wft_poller_behavior}, }; +use crossbeam_utils::atomic::AtomicCell; use futures_util::{Stream, stream}; use std::sync::{Arc, OnceLock}; +use std::time::SystemTime; use temporal_sdk_core_api::worker::{WorkerConfig, WorkflowSlotKind}; use temporal_sdk_core_protos::temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse; use tokio::sync::watch; use tokio_util::sync::CancellationToken; +#[allow(clippy::too_many_arguments)] pub(crate) fn make_wft_poller( config: &WorkerConfig, sticky_queue_name: &Option, @@ -20,6 +23,8 @@ pub(crate) fn make_wft_poller( metrics: &MetricsContext, shutdown_token: &CancellationToken, wft_slots: &MeteredPermitDealer, + last_successful_poll_time: Arc>>, + sticky_last_successful_poll_time: Arc>>, ) -> impl Stream< Item = Result< ( @@ -52,6 +57,7 @@ pub(crate) fn make_wft_poller( WorkflowTaskOptions { wft_poller_shared: wft_poller_shared.clone(), }, + last_successful_poll_time, ); let sticky_queue_poller = sticky_queue_name.as_ref().map(|sqn| { let sticky_metrics = metrics.with_new_attrs([workflow_sticky_poller()]); @@ -66,6 +72,7 @@ pub(crate) fn make_wft_poller( sticky_metrics.record_num_pollers(np); }), WorkflowTaskOptions { wft_poller_shared }, + sticky_last_successful_poll_time, ) }); let wf_task_poll_buffer = Box::new(WorkflowTaskPoller::new( diff --git a/sdk/src/lib.rs b/sdk/src/lib.rs index 2161a5ea9..e3970b4e2 100644 --- a/sdk/src/lib.rs +++ b/sdk/src/lib.rs @@ -10,7 +10,7 @@ //! ```no_run //! use std::{str::FromStr, sync::Arc}; //! use temporal_sdk::{sdk_client_options, ActContext, Worker}; -//! use temporal_sdk_core::{init_worker, Url, CoreRuntime}; +//! use temporal_sdk_core::{init_worker, Url, CoreRuntime, RuntimeOptionsBuilder}; //! use temporal_sdk_core_api::{ //! worker::{WorkerConfigBuilder, WorkerVersioningStrategy}, //! telemetry::TelemetryOptionsBuilder @@ -20,10 +20,11 @@ //! async fn main() -> Result<(), Box> { //! let server_options = sdk_client_options(Url::from_str("http://localhost:7233")?).build()?; //! -//! let client = server_options.connect("default", None).await?; -//! //! let telemetry_options = TelemetryOptionsBuilder::default().build()?; -//! let runtime = CoreRuntime::new_assume_tokio(telemetry_options)?; +//! let runtime_options = RuntimeOptionsBuilder::default().telemetry_options(telemetry_options).build().unwrap(); +//! let runtime = CoreRuntime::new_assume_tokio(runtime_options)?; +//! +//! let client = server_options.connect("default", None).await?; //! //! let worker_config = WorkerConfigBuilder::default() //! 
.namespace("default") diff --git a/tests/common/mod.rs b/tests/common/mod.rs index a53d702d0..4933372ce 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -38,9 +38,10 @@ use temporal_sdk::{ WorkerInterceptor, }, }; +pub(crate) use temporal_sdk_core::test_help::NAMESPACE; use temporal_sdk_core::{ - ClientOptions, ClientOptionsBuilder, CoreRuntime, WorkerConfigBuilder, init_replay_worker, - init_worker, + ClientOptions, ClientOptionsBuilder, CoreRuntime, RuntimeOptions, RuntimeOptionsBuilder, + WorkerConfigBuilder, init_replay_worker, init_worker, replay::{HistoryForReplay, ReplayWorkerInput}, telemetry::{build_otlp_metric_exporter, start_prometheus_metric_exporter}, }; @@ -68,8 +69,7 @@ use tokio::{sync::OnceCell, task::AbortHandle}; use tonic::IntoRequest; use tracing::{debug, warn}; use url::Url; - -pub(crate) use temporal_sdk_core::test_help::NAMESPACE; +use uuid::Uuid; /// The env var used to specify where the integ tests should point pub(crate) const INTEG_SERVER_TARGET_ENV_VAR: &str = "TEMPORAL_SERVICE_ADDRESS"; pub(crate) const INTEG_NAMESPACE_ENV_VAR: &str = "TEMPORAL_NAMESPACE"; @@ -107,7 +107,8 @@ pub(crate) fn integ_worker_config(tq: &str) -> WorkerConfigBuilder { .max_outstanding_workflow_tasks(100_usize) .versioning_strategy(WorkerVersioningStrategy::None { build_id: "test_build_id".to_owned(), - }); + }) + .skip_client_worker_set_check(true); b } @@ -170,8 +171,12 @@ pub(crate) fn init_integ_telem() -> Option<&'static CoreRuntime> { } Some(INTEG_TESTS_RT.get_or_init(|| { let telemetry_options = get_integ_telem_options(); + let runtime_options = RuntimeOptionsBuilder::default() + .telemetry_options(telemetry_options) + .build() + .expect("Runtime options build cleanly"); let rt = - CoreRuntime::new_assume_tokio(telemetry_options).expect("Core runtime inits cleanly"); + CoreRuntime::new_assume_tokio(runtime_options).expect("Core runtime inits cleanly"); if let Some(sub) = rt.telemetry().trace_subscriber() { let _ = tracing::subscriber::set_global_default(sub); } @@ -319,8 +324,7 @@ impl CoreWfStarter { pub(crate) async fn worker(&mut self) -> TestWorker { let w = self.get_worker().await; - let tq = w.get_config().task_queue.clone(); - let mut w = TestWorker::new(w, tq); + let mut w = TestWorker::new(w); w.client = Some(self.get_client().await); w @@ -482,8 +486,11 @@ pub(crate) struct TestWorker { } impl TestWorker { /// Create a new test worker - pub(crate) fn new(core_worker: Arc, task_queue: impl Into) -> Self { - let inner = Worker::new_from_core(core_worker.clone(), task_queue); + pub(crate) fn new(core_worker: Arc) -> Self { + let inner = Worker::new_from_core( + core_worker.clone(), + core_worker.get_config().task_queue.clone(), + ); Self { inner, core_worker, @@ -497,6 +504,10 @@ impl TestWorker { &mut self.inner } + pub(crate) fn worker_instance_key(&self) -> Uuid { + self.core_worker.worker_instance_key() + } + // TODO: Maybe trait-ify? 
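A short sketch of the updated test-helper usage; `core` stands for any already-initialized core worker, and both calls shown are introduced by this change. The task queue is now read from the worker's own config, and the instance key is what the new heartbeat tests use to pick a specific worker out of `ListWorkers` responses.

    let worker = TestWorker::new(Arc::new(core));
    let instance_key = worker.worker_instance_key();
    // later: match `hb.worker_instance_key == instance_key.to_string()` in ListWorkers output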
pub(crate) fn register_wf>( &mut self, @@ -811,6 +822,13 @@ pub(crate) fn get_integ_telem_options() -> TelemetryOptions { .unwrap() } +pub(crate) fn get_integ_runtime_options(telemopts: TelemetryOptions) -> RuntimeOptions { + RuntimeOptionsBuilder::default() + .telemetry_options(telemopts) + .build() + .unwrap() +} + #[async_trait::async_trait(?Send)] pub(crate) trait WorkflowHandleExt { async fn fetch_history_and_replay( @@ -936,10 +954,7 @@ pub(crate) fn mock_sdk_cfg( let mut mock = build_mock_pollers(poll_cfg); mock.worker_cfg(mutator); let core = mock_worker(mock); - TestWorker::new( - Arc::new(core), - temporal_sdk_core::test_help::TEST_Q.to_string(), - ) + TestWorker::new(Arc::new(core)) } #[derive(Default)] @@ -1014,6 +1029,10 @@ pub(crate) fn integ_dev_server_config( "system.enableDeploymentVersions=true".to_owned(), "--dynamic-config-value".to_owned(), "component.nexusoperations.recordCancelRequestCompletionEvents=true".to_owned(), + "--dynamic-config-value".to_owned(), + "frontend.WorkerHeartbeatsEnabled=true".to_owned(), + "--dynamic-config-value".to_owned(), + "frontend.ListWorkersEnabled=true".to_owned(), "--search-attribute".to_string(), format!("{SEARCH_ATTR_TXT}=Text"), "--search-attribute".to_string(), diff --git a/tests/global_metric_tests.rs b/tests/global_metric_tests.rs index 14e799195..822bf238c 100644 --- a/tests/global_metric_tests.rs +++ b/tests/global_metric_tests.rs @@ -2,6 +2,7 @@ #[allow(dead_code)] mod common; +use crate::common::get_integ_runtime_options; use common::CoreWfStarter; use parking_lot::Mutex; use std::{sync::Arc, time::Duration}; @@ -71,18 +72,16 @@ async fn otel_errors_logged_as_errors() { .unwrap(), ) .unwrap(); + let telemopts = TelemetryOptionsBuilder::default() + .metrics(Arc::new(exporter) as Arc) + // Importantly, _not_ using subscriber override, is using console. + .logging(Logger::Console { + filter: construct_filter_string(Level::INFO, Level::WARN), + }) + .build() + .unwrap(); - let rt = CoreRuntime::new_assume_tokio( - TelemetryOptionsBuilder::default() - .metrics(Arc::new(exporter) as Arc) - // Importantly, _not_ using subscriber override, is using console. 
- .logging(Logger::Console { - filter: construct_filter_string(Level::INFO, Level::WARN), - }) - .build() - .unwrap(), - ) - .unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let mut starter = CoreWfStarter::new_with_runtime("otel_errors_logged_as_errors", rt); let _worker = starter.get_worker().await; diff --git a/tests/heavy_tests.rs b/tests/heavy_tests.rs index f5dc9018d..64bb298fb 100644 --- a/tests/heavy_tests.rs +++ b/tests/heavy_tests.rs @@ -2,6 +2,7 @@ #[allow(dead_code)] mod common; +use crate::common::get_integ_runtime_options; use common::{ CoreWfStarter, init_integ_telem, prom_metrics, rand_6_chars, workflows::la_problem_workflow, }; @@ -194,7 +195,7 @@ async fn workflow_load() { // cause us to encounter the tracing span drop bug telemopts.logging = None; init_integ_telem(); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let mut starter = CoreWfStarter::new_with_runtime("workflow_load", rt); starter .worker_config diff --git a/tests/integ_tests/metrics_tests.rs b/tests/integ_tests/metrics_tests.rs index 3e0b2ad16..dddd3fe3c 100644 --- a/tests/integ_tests/metrics_tests.rs +++ b/tests/integ_tests/metrics_tests.rs @@ -1,3 +1,4 @@ +use crate::common::get_integ_runtime_options; use crate::{ common::{ ANY_PORT, CoreWfStarter, NAMESPACE, OTEL_URL_ENV_VAR, PROMETHEUS_QUERY_API, @@ -98,7 +99,7 @@ async fn prometheus_metrics_exported( }); } let (telemopts, addr, _aborter) = prom_metrics(Some(opts_builder.build().unwrap())); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let opts = get_integ_server_options(); let mut raw_client = opts .connect_no_namespace(rt.telemetry().get_temporal_metric_meter()) @@ -149,7 +150,7 @@ async fn prometheus_metrics_exported( async fn one_slot_worker_reports_available_slot() { let (telemopts, addr, _aborter) = prom_metrics(None); let tq = "one_slot_worker_tq"; - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let worker_cfg = WorkerConfigBuilder::default() .namespace(NAMESPACE) @@ -402,7 +403,7 @@ async fn query_of_closed_workflow_doesnt_tick_terminal_metric( completion: workflow_command::Variant, ) { let (telemopts, addr, _aborter) = prom_metrics(None); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let mut starter = CoreWfStarter::new_with_runtime("query_of_closed_workflow_doesnt_tick_terminal_metric", rt); // Disable cache to ensure replay happens completely @@ -524,8 +525,11 @@ async fn query_of_closed_workflow_doesnt_tick_terminal_metric( #[test] fn runtime_new() { - let mut rt = - CoreRuntime::new(get_integ_telem_options(), TokioRuntimeBuilder::default()).unwrap(); + let mut rt = CoreRuntime::new( + get_integ_runtime_options(get_integ_telem_options()), + TokioRuntimeBuilder::default(), + ) + .unwrap(); let handle = rt.tokio_handle(); let _rt = handle.enter(); let (telemopts, addr, _aborter) = prom_metrics(None); @@ -571,7 +575,7 @@ async fn latency_metrics( .build() .unwrap(), )); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let mut starter = CoreWfStarter::new_with_runtime("latency_metrics", 
rt); let worker = starter.get_worker().await; starter.start_wf().await; @@ -625,7 +629,7 @@ async fn latency_metrics( #[tokio::test] async fn request_fail_codes() { let (telemopts, addr, _aborter) = prom_metrics(None); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let opts = get_integ_server_options(); let mut client = opts .connect(NAMESPACE, rt.telemetry().get_temporal_metric_meter()) @@ -671,8 +675,8 @@ async fn request_fail_codes_otel() { let mut telemopts = TelemetryOptionsBuilder::default(); let exporter = Arc::new(exporter); telemopts.metrics(exporter as Arc); - - let rt = CoreRuntime::new_assume_tokio(telemopts.build().unwrap()).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts.build().unwrap())) + .unwrap(); let opts = get_integ_server_options(); let mut client = opts .connect(NAMESPACE, rt.telemetry().get_temporal_metric_meter()) @@ -725,7 +729,7 @@ async fn docker_metrics_with_prometheus( .metric_prefix(test_uid.clone()) .build() .unwrap(); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let test_name = "docker_metrics_with_prometheus"; let mut starter = CoreWfStarter::new_with_runtime(test_name, rt); let worker = starter.get_worker().await; @@ -779,7 +783,7 @@ async fn docker_metrics_with_prometheus( #[tokio::test] async fn activity_metrics() { let (telemopts, addr, _aborter) = prom_metrics(None); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let wf_name = "activity_metrics"; let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); starter @@ -913,7 +917,7 @@ async fn activity_metrics() { #[tokio::test] async fn nexus_metrics() { let (telemopts, addr, _aborter) = prom_metrics(None); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let wf_name = "nexus_metrics"; let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); starter.worker_config.no_remote_activities(true); @@ -1090,7 +1094,7 @@ async fn nexus_metrics() { #[tokio::test] async fn evict_on_complete_does_not_count_as_forced_eviction() { let (telemopts, addr, _aborter) = prom_metrics(None); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let wf_name = "evict_on_complete_does_not_count_as_forced_eviction"; let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); starter.worker_config.no_remote_activities(true); @@ -1173,7 +1177,7 @@ where #[tokio::test] async fn metrics_available_from_custom_slot_supplier() { let (telemopts, addr, _aborter) = prom_metrics(None); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let mut starter = CoreWfStarter::new_with_runtime("metrics_available_from_custom_slot_supplier", rt); starter.worker_config.no_remote_activities(true); @@ -1340,7 +1344,7 @@ async fn sticky_queue_label_strategy( .unwrap(), )); telemopts.task_queue_label_strategy = strategy; - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let wf_name = 
format!("sticky_queue_label_strategy_{strategy:?}"); let mut starter = CoreWfStarter::new_with_runtime(&wf_name, rt); // Enable sticky queues by setting a reasonable cache size @@ -1404,8 +1408,7 @@ async fn sticky_queue_label_strategy( if l.contains("task_queue=") && l.contains("WorkflowTask") { assert!( l.contains(&format!("task_queue=\"{task_queue}\"")), - "With UseNormal, all workflow task_queue labels should use normal name. Found: {}", - l + "With UseNormal, all workflow task_queue labels should use normal name. Found: {l}", ); } } diff --git a/tests/integ_tests/polling_tests.rs b/tests/integ_tests/polling_tests.rs index 1c5219eb0..e502cc9b4 100644 --- a/tests/integ_tests/polling_tests.rs +++ b/tests/integ_tests/polling_tests.rs @@ -18,7 +18,7 @@ use std::{ use temporal_client::{WfClientExt, WorkflowClientTrait, WorkflowOptions}; use temporal_sdk::{ActivityOptions, WfContext}; use temporal_sdk_core::{ - ClientOptionsBuilder, CoreRuntime, + ClientOptionsBuilder, CoreRuntime, RuntimeOptionsBuilder, ephemeral_server::{TemporalDevServerConfigBuilder, default_cached_download}, init_worker, telemetry::CoreLogStreamConsumer, @@ -221,7 +221,9 @@ async fn switching_worker_client_changes_poll() { // Swap client, poll for next task, confirm it's second wf, and respond w/ empty info!("Replacing client and polling again"); - worker.replace_client(client2.get_client().inner().clone()); + worker + .replace_client(client2.get_client().inner().clone()) + .unwrap(); let act2 = worker.poll_workflow_activation().await.unwrap(); assert_eq!(wf2.run_id, act2.run_id); worker.complete_execution(&act2.run_id).await; @@ -339,7 +341,11 @@ async fn replace_client_works_after_polling_failure() { }) .build() .unwrap(); - let rt = Arc::new(CoreRuntime::new_assume_tokio(telem_opts).unwrap()); + let runtime_opts = RuntimeOptionsBuilder::default() + .telemetry_options(telem_opts) + .build() + .unwrap(); + let rt = Arc::new(CoreRuntime::new_assume_tokio(runtime_opts).unwrap()); // Spawning background task to read logs and notify the test when polling failure occurs. let look_for_poll_failure_log = Arc::new(AtomicBool::new(false)); @@ -477,7 +483,7 @@ async fn replace_client_works_after_polling_failure() { // Switch worker over to the main integration server. // The polling started on the initial server should complete with a task from the new server. 
- worker.replace_client(client_for_integ_server); + worker.replace_client(client_for_integ_server).unwrap(); let act_2 = tokio::time::timeout(Duration::from_secs(60), poll_join_handle) .await .unwrap() diff --git a/tests/integ_tests/worker_heartbeat_tests.rs b/tests/integ_tests/worker_heartbeat_tests.rs new file mode 100644 index 000000000..c79ff03f3 --- /dev/null +++ b/tests/integ_tests/worker_heartbeat_tests.rs @@ -0,0 +1,1050 @@ +use crate::common::{ANY_PORT, CoreWfStarter, eventually, get_integ_telem_options}; +use anyhow::anyhow; +use crossbeam_utils::atomic::AtomicCell; +use futures_util::StreamExt; +use prost_types::Duration as PbDuration; +use prost_types::Timestamp; +use std::collections::HashSet; +use std::env; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use temporal_client::{ + Client, NamespacedClient, RetryClient, WfClientExt, WorkflowClientTrait, WorkflowService, +}; +use temporal_sdk::{ActContext, ActivityOptions, WfContext}; +use temporal_sdk_core::telemetry::{build_otlp_metric_exporter, start_prometheus_metric_exporter}; +use temporal_sdk_core::{ + CoreRuntime, ResourceBasedTuner, ResourceSlotOptions, RuntimeOptionsBuilder, +}; +use temporal_sdk_core_api::telemetry::{ + OtelCollectorOptionsBuilder, PrometheusExporterOptionsBuilder, TelemetryOptionsBuilder, +}; +use temporal_sdk_core_api::worker::PollerBehavior; +use temporal_sdk_core_protos::coresdk::{AsJsonPayloadExt, FromJsonPayloadExt}; +use temporal_sdk_core_protos::prost_dur; +use temporal_sdk_core_protos::temporal::api::common::v1::RetryPolicy; +use temporal_sdk_core_protos::temporal::api::enums::v1::WorkerStatus; +use temporal_sdk_core_protos::temporal::api::worker::v1::{PluginInfo, WorkerHeartbeat}; +use temporal_sdk_core_protos::temporal::api::workflowservice::v1::DescribeWorkerRequest; +use temporal_sdk_core_protos::temporal::api::workflowservice::v1::ListWorkersRequest; +use tokio::sync::Notify; +use tokio::time::sleep; +use tonic::IntoRequest; +use url::Url; + +fn within_two_minutes_ts(ts: Timestamp) -> bool { + let ts_time = UNIX_EPOCH + Duration::new(ts.seconds as u64, ts.nanos as u32); + + let now = SystemTime::now(); + // ts should be at most 2 minutes before the current time + now.duration_since(ts_time).unwrap() <= Duration::from_secs(2 * 60) +} + +fn within_duration(dur: PbDuration, threshold: Duration) -> bool { + let std_dur = Duration::new(dur.seconds as u64, dur.nanos as u32); + std_dur <= threshold +} + +fn new_no_metrics_starter(wf_name: &str) -> CoreWfStarter { + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(TelemetryOptionsBuilder::default().build().unwrap()) + .heartbeat_interval(Some(Duration::from_millis(100))) + .build() + .unwrap(); + CoreWfStarter::new_with_runtime(wf_name, CoreRuntime::new_assume_tokio(runtimeopts).unwrap()) +} + +fn to_system_time(ts: Timestamp) -> SystemTime { + UNIX_EPOCH + Duration::new(ts.seconds as u64, ts.nanos as u32) +} + +async fn list_worker_heartbeats( + client: &Arc>, + query: impl Into, +) -> Vec { + let mut raw_client = client.as_ref().clone(); + WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 200, + next_page_token: Vec::new(), + query: query.into(), + } + .into_request(), + ) + .await + .unwrap() + .into_inner() + .workers_info + .into_iter() + .filter_map(|info| info.worker_heartbeat) + .collect() +} + +// Tests that rely on Prometheus running in a docker container need to 
start +// with `docker_` and set the `DOCKER_PROMETHEUS_RUNNING` env variable to run +#[rstest::rstest] +#[tokio::test] +async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] backing: &str) { + if env::var("DOCKER_PROMETHEUS_RUNNING").is_err() { + return; + } + let telemopts = if backing == "no_metrics" { + TelemetryOptionsBuilder::default().build().unwrap() + } else { + get_integ_telem_options() + }; + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(telemopts) + .heartbeat_interval(Some(Duration::from_millis(100))) + .build() + .unwrap(); + let mut rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); + match backing { + "otel" => { + let url = Some("grpc://localhost:4317") + .map(|x| x.parse::().unwrap()) + .unwrap(); + let mut opts_build = OtelCollectorOptionsBuilder::default(); + let opts = opts_build.url(url).build().unwrap(); + rt.telemetry_mut() + .attach_late_init_metrics(Arc::new(build_otlp_metric_exporter(opts).unwrap())); + } + "prom" => { + let mut opts_build = PrometheusExporterOptionsBuilder::default(); + opts_build.socket_addr(ANY_PORT.parse().unwrap()); + let opts = opts_build.build().unwrap(); + rt.telemetry_mut() + .attach_late_init_metrics(start_prometheus_metric_exporter(opts).unwrap().meter); + } + "no_metrics" => {} + _ => unreachable!(), + } + let wf_name = format!("worker_heartbeat_basic_{backing}"); + let mut starter = CoreWfStarter::new_with_runtime(&wf_name, rt); + starter + .worker_config + .max_outstanding_workflow_tasks(5_usize) + .max_cached_workflows(5_usize) + .max_outstanding_activities(5_usize) + .plugins(vec![ + PluginInfo { + name: "plugin1".to_string(), + version: "1".to_string(), + }, + PluginInfo { + name: "plugin2".to_string(), + version: "2".to_string(), + }, + ]); + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + ctx.activity(ActivityOptions { + activity_type: "pass_fail_act".to_string(), + input: "pass".as_json_payload().expect("serializes fine"), + start_to_close_timeout: Some(Duration::from_secs(1)), + ..Default::default() + }) + .await; + Ok(().into()) + }); + + let acts_started = Arc::new(Notify::new()); + let acts_done = Arc::new(Notify::new()); + + let acts_started_act = acts_started.clone(); + let acts_done_act = acts_done.clone(); + worker.register_activity("pass_fail_act", move |_ctx: ActContext, i: String| { + let acts_started = acts_started_act.clone(); + let acts_done = acts_done_act.clone(); + async move { + acts_started.notify_one(); + acts_done.notified().await; + Ok(i) + } + }); + + starter + .start_with_worker(wf_name.clone(), &mut worker) + .await; + + let start_time = AtomicCell::new(None); + let heartbeat_time = AtomicCell::new(None); + + let test_fut = async { + // Give enough time to ensure heartbeat interval has been hit + tokio::time::sleep(Duration::from_millis(110)).await; + acts_started.notified().await; + let client = starter.get_client().await; + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + } + .into_request(), + ) + .await + .unwrap() + .into_inner(); + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == 
worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + assert_eq!( + heartbeat.worker_instance_key, + worker_instance_key.to_string() + ); + in_activity_checks(heartbeat, &start_time, &heartbeat_time); + acts_done.notify_one(); + }; + + let runner = async move { + worker.run_until_done().await.unwrap(); + }; + tokio::join!(test_fut, runner); + + let client = starter.get_client().await; + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + } + .into_request(), + ) + .await + .unwrap() + .into_inner(); + // Since list_workers finds all workers in the namespace, must find specific worker used in this + // test + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + after_shutdown_checks(heartbeat, &wf_name, &start_time, &heartbeat_time); +} + +// Tests that rely on Prometheus running in a docker container need to start +// with `docker_` and set the `DOCKER_PROMETHEUS_RUNNING` env variable to run +#[tokio::test] +async fn docker_worker_heartbeat_tuner() { + if env::var("DOCKER_PROMETHEUS_RUNNING").is_err() { + return; + } + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(get_integ_telem_options()) + .heartbeat_interval(Some(Duration::from_millis(100))) + .build() + .unwrap(); + let mut rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); + + let url = Some("grpc://localhost:4317") + .map(|x| x.parse::().unwrap()) + .unwrap(); + let mut opts_build = OtelCollectorOptionsBuilder::default(); + let opts = opts_build.url(url).build().unwrap(); + + rt.telemetry_mut() + .attach_late_init_metrics(Arc::new(build_otlp_metric_exporter(opts).unwrap())); + let wf_name = "worker_heartbeat_tuner"; + let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); + let mut tuner = ResourceBasedTuner::new(0.0, 0.0); + tuner + .with_workflow_slots_options(ResourceSlotOptions::new(2, 10, Duration::from_millis(0))) + .with_activity_slots_options(ResourceSlotOptions::new(5, 10, Duration::from_millis(50))); + starter + .worker_config + .workflow_task_poller_behavior(PollerBehavior::Autoscaling { + minimum: 1, + maximum: 200, + initial: 5, + }) + .nexus_task_poller_behavior(PollerBehavior::Autoscaling { + minimum: 1, + maximum: 200, + initial: 5, + }) + .clear_max_outstanding_opts() + .tuner(Arc::new(tuner)); + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + + // Run a workflow + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + ctx.activity(ActivityOptions { + activity_type: "pass_fail_act".to_string(), + input: "pass".as_json_payload().expect("serializes fine"), + start_to_close_timeout: Some(Duration::from_secs(1)), + ..Default::default() + }) + .await; + Ok(().into()) + }); + worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + Ok(i) + }); + + starter.start_with_worker(wf_name, &mut worker).await; + worker.run_until_done().await.unwrap(); + + let client = starter.get_client().await; + let mut raw_client = (*client).clone(); + let workers_list = 
WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + } + .into_request(), + ) + .await + .unwrap() + .into_inner(); + // Since list_workers finds all workers in the namespace, must find specific worker used in this + // test + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + assert!(heartbeat.task_queue.starts_with(wf_name)); + + assert_eq!( + heartbeat + .workflow_task_slots_info + .clone() + .unwrap() + .slot_supplier_kind, + "ResourceBased" + ); + assert_eq!( + heartbeat + .activity_task_slots_info + .clone() + .unwrap() + .slot_supplier_kind, + "ResourceBased" + ); + assert_eq!( + heartbeat + .nexus_task_slots_info + .clone() + .unwrap() + .slot_supplier_kind, + "ResourceBased" + ); + assert_eq!( + heartbeat + .local_activity_slots_info + .clone() + .unwrap() + .slot_supplier_kind, + "ResourceBased" + ); + + let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); + assert!(workflow_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + workflow_poller_info.last_successful_poll_time.unwrap() + )); + let sticky_poller_info = heartbeat.workflow_sticky_poller_info.unwrap(); + assert!(sticky_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + sticky_poller_info.last_successful_poll_time.unwrap() + )); + let nexus_poller_info = heartbeat.nexus_poller_info.unwrap(); + assert!(nexus_poller_info.is_autoscaling); + assert!(nexus_poller_info.last_successful_poll_time.is_none()); + let activity_poller_info = heartbeat.activity_poller_info.unwrap(); + assert!(!activity_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + activity_poller_info.last_successful_poll_time.unwrap() + )); +} + +fn in_activity_checks( + heartbeat: &WorkerHeartbeat, + start_time: &AtomicCell>, + heartbeat_time: &AtomicCell>, +) { + assert_eq!(heartbeat.status, WorkerStatus::Running as i32); + + let workflow_task_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); + assert_eq!(workflow_task_slots.total_processed_tasks, 1); + assert_eq!(workflow_task_slots.current_available_slots, 5); + assert_eq!(workflow_task_slots.current_used_slots, 0); + assert_eq!(workflow_task_slots.slot_supplier_kind, "Fixed"); + let activity_task_slots = heartbeat.activity_task_slots_info.clone().unwrap(); + assert_eq!(activity_task_slots.current_available_slots, 4); + assert_eq!(activity_task_slots.current_used_slots, 1); + assert_eq!(activity_task_slots.slot_supplier_kind, "Fixed"); + let nexus_task_slots = heartbeat.nexus_task_slots_info.clone().unwrap(); + assert_eq!(nexus_task_slots.current_available_slots, 0); + assert_eq!(nexus_task_slots.current_used_slots, 0); + assert_eq!(nexus_task_slots.slot_supplier_kind, "Fixed"); + let local_activity_task_slots = heartbeat.local_activity_slots_info.clone().unwrap(); + assert_eq!(local_activity_task_slots.current_available_slots, 100); + assert_eq!(local_activity_task_slots.current_used_slots, 0); + assert_eq!(local_activity_task_slots.slot_supplier_kind, "Fixed"); + + let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); + assert_eq!(workflow_poller_info.current_pollers, 1); + let sticky_poller_info = 
heartbeat.workflow_sticky_poller_info.unwrap(); + assert_ne!(sticky_poller_info.current_pollers, 0); + let nexus_poller_info = heartbeat.nexus_poller_info.unwrap(); + assert_eq!(nexus_poller_info.current_pollers, 0); + let activity_poller_info = heartbeat.activity_poller_info.unwrap(); + assert_ne!(activity_poller_info.current_pollers, 0); + assert_ne!(heartbeat.current_sticky_cache_size, 0); + start_time.store(Some(heartbeat.start_time.unwrap())); + heartbeat_time.store(Some(heartbeat.heartbeat_time.unwrap())); +} + +fn after_shutdown_checks( + heartbeat: &WorkerHeartbeat, + wf_name: &str, + start_time: &AtomicCell<Option<Timestamp>>, + heartbeat_time: &AtomicCell<Option<Timestamp>>, +) { + assert_eq!(heartbeat.worker_identity, "integ_tester"); + let host_info = heartbeat.host_info.clone().unwrap(); + assert!(!host_info.host_name.is_empty()); + assert!(!host_info.process_key.is_empty()); + assert!(!host_info.process_id.is_empty()); + assert_ne!(host_info.current_host_cpu_usage, 0.0); + assert_ne!(host_info.current_host_mem_usage, 0.0); + + assert!(heartbeat.task_queue.starts_with(wf_name)); + assert_eq!( + heartbeat.deployment_version.clone().unwrap().build_id, + "test_build_id" + ); + assert_eq!(heartbeat.sdk_name, "temporal-core"); + assert_eq!(heartbeat.sdk_version, "0.1.0"); + assert_eq!(heartbeat.status, WorkerStatus::Shutdown as i32); + + assert_eq!(start_time.load().unwrap(), heartbeat.start_time.unwrap()); + assert_ne!( + heartbeat_time.load().unwrap(), + heartbeat.heartbeat_time.unwrap() + ); + assert!(within_two_minutes_ts(heartbeat.start_time.unwrap())); + assert!(within_two_minutes_ts(heartbeat.heartbeat_time.unwrap())); + assert!( + to_system_time(heartbeat_time.load().unwrap()) + < to_system_time(heartbeat.heartbeat_time.unwrap()) + ); + assert!(within_duration( + heartbeat.elapsed_since_last_heartbeat.unwrap(), + Duration::from_millis(200) + )); + + let workflow_task_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); + assert_eq!(workflow_task_slots.current_available_slots, 5); + assert_eq!(workflow_task_slots.current_used_slots, 1); + assert_eq!(workflow_task_slots.total_processed_tasks, 2); + assert_eq!(workflow_task_slots.slot_supplier_kind, "Fixed"); + let activity_task_slots = heartbeat.activity_task_slots_info.clone().unwrap(); + assert_eq!(activity_task_slots.current_available_slots, 5); + assert_eq!(activity_task_slots.current_used_slots, 0); + assert_eq!(activity_task_slots.slot_supplier_kind, "Fixed"); + assert_eq!(activity_task_slots.last_interval_processed_tasks, 1); + let nexus_task_slots = heartbeat.nexus_task_slots_info.clone().unwrap(); + assert_eq!(nexus_task_slots.current_available_slots, 0); + assert_eq!(nexus_task_slots.current_used_slots, 0); + assert_eq!(nexus_task_slots.slot_supplier_kind, "Fixed"); + let local_activity_task_slots = heartbeat.local_activity_slots_info.clone().unwrap(); + assert_eq!(local_activity_task_slots.current_available_slots, 100); + assert_eq!(local_activity_task_slots.current_used_slots, 0); + assert_eq!(local_activity_task_slots.slot_supplier_kind, "Fixed"); + + let workflow_poller_info = heartbeat.workflow_poller_info.unwrap(); + assert!(!workflow_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + workflow_poller_info.last_successful_poll_time.unwrap() + )); + let sticky_poller_info = heartbeat.workflow_sticky_poller_info.unwrap(); + assert!(!sticky_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + sticky_poller_info.last_successful_poll_time.unwrap() + )); + let nexus_poller_info = heartbeat.nexus_poller_info.unwrap(); + 
assert!(!nexus_poller_info.is_autoscaling); + assert!(nexus_poller_info.last_successful_poll_time.is_none()); + let activity_poller_info = heartbeat.activity_poller_info.unwrap(); + assert!(!activity_poller_info.is_autoscaling); + assert!(within_two_minutes_ts( + activity_poller_info.last_successful_poll_time.unwrap() + )); + + assert_eq!(heartbeat.total_sticky_cache_hit, 2); + assert_eq!(heartbeat.current_sticky_cache_size, 0); + assert_eq!( + heartbeat.plugins, + vec![ + PluginInfo { + name: "plugin1".to_string(), + version: "1".to_string() + }, + PluginInfo { + name: "plugin2".to_string(), + version: "2".to_string() + } + ] + ); +} + +#[tokio::test] +async fn worker_heartbeat_sticky_cache_miss() { + let wf_name = "worker_heartbeat_cache_miss"; + let mut starter = new_no_metrics_starter(wf_name); + starter + .worker_config + .max_cached_workflows(1_usize) + .max_outstanding_workflow_tasks(2_usize); + + let mut worker = starter.worker().await; + worker.fetch_results = false; + let worker_key = worker.worker_instance_key().to_string(); + let worker_core = worker.core_worker.clone(); + let submitter = worker.get_submitter_handle(); + let wf_opts = starter.workflow_options.clone(); + let client = starter.get_client().await; + let client_for_orchestrator = client.clone(); + + static HISTORY_WF1_ACTIVITY_STARTED: Notify = Notify::const_new(); + static HISTORY_WF1_ACTIVITY_FINISH: Notify = Notify::const_new(); + static HISTORY_WF2_ACTIVITY_STARTED: Notify = Notify::const_new(); + static HISTORY_WF2_ACTIVITY_FINISH: Notify = Notify::const_new(); + + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + let wf_marker = ctx + .get_args() + .first() + .and_then(|p| String::from_json_payload(p).ok()) + .unwrap_or_else(|| "wf1".to_string()); + + ctx.activity(ActivityOptions { + activity_type: "sticky_cache_history_act".to_string(), + input: wf_marker.clone().as_json_payload().expect("serialize"), + start_to_close_timeout: Some(Duration::from_secs(5)), + ..Default::default() + }) + .await; + + Ok(().into()) + }); + worker.register_activity( + "sticky_cache_history_act", + |_ctx: ActContext, marker: String| async move { + match marker.as_str() { + "wf1" => { + HISTORY_WF1_ACTIVITY_STARTED.notify_one(); + HISTORY_WF1_ACTIVITY_FINISH.notified().await; + } + "wf2" => { + HISTORY_WF2_ACTIVITY_STARTED.notify_one(); + HISTORY_WF2_ACTIVITY_FINISH.notified().await; + } + _ => {} + } + Ok(marker) + }, + ); + + let wf1_id = format!("{wf_name}_wf1"); + let wf2_id = format!("{wf_name}_wf2"); + + let orchestrator = async move { + let wf1_run = submitter + .submit_wf( + wf1_id.clone(), + wf_name.to_string(), + vec!["wf1".to_string().as_json_payload().unwrap()], + wf_opts.clone(), + ) + .await + .unwrap(); + + HISTORY_WF1_ACTIVITY_STARTED.notified().await; + + let wf2_run = submitter + .submit_wf( + wf2_id.clone(), + wf_name.to_string(), + vec!["wf2".to_string().as_json_payload().unwrap()], + wf_opts, + ) + .await + .unwrap(); + + HISTORY_WF2_ACTIVITY_STARTED.notified().await; + + HISTORY_WF1_ACTIVITY_FINISH.notify_one(); + let handle1 = client_for_orchestrator.get_untyped_workflow_handle(wf1_id, wf1_run); + handle1 + .get_workflow_result(Default::default()) + .await + .expect("wf1 result"); + + HISTORY_WF2_ACTIVITY_FINISH.notify_one(); + let handle2 = client_for_orchestrator.get_untyped_workflow_handle(wf2_id, wf2_run); + handle2 + .get_workflow_result(Default::default()) + .await + .expect("wf2 result"); + + worker_core.initiate_shutdown(); + }; + + let mut worker_runner = worker; + let runner = 
async move { + worker_runner.run_until_done().await.unwrap(); + }; + + tokio::join!(orchestrator, runner); + + sleep(Duration::from_millis(200)).await; + let mut heartbeats = + list_worker_heartbeats(&client, format!("WorkerInstanceKey=\"{worker_key}\"")).await; + assert_eq!(heartbeats.len(), 1); + let heartbeat = heartbeats.pop().unwrap(); + + assert!(heartbeat.total_sticky_cache_miss >= 1); + assert_eq!(heartbeat.worker_instance_key, worker_key); +} + +#[tokio::test] +async fn worker_heartbeat_multiple_workers() { + let wf_name = "worker_heartbeat_multi_workers"; + let mut starter = new_no_metrics_starter(wf_name); + starter + .worker_config + .max_outstanding_workflow_tasks(5_usize) + .max_cached_workflows(5_usize); + + let client = starter.get_client().await; + let starting_hb_len = list_worker_heartbeats(&client, String::new()).await.len(); + + let mut worker_a = starter.worker().await; + worker_a.register_wf(wf_name.to_string(), |_ctx: WfContext| async move { + Ok(().into()) + }); + worker_a.register_activity("failing_act", |_ctx: ActContext, _: String| async move { + Ok(()) + }); + + let mut starter_b = starter.clone_no_worker(); + let mut worker_b = starter_b.worker().await; + worker_b.register_wf(wf_name.to_string(), |_ctx: WfContext| async move { + Ok(().into()) + }); + worker_b.register_activity("failing_act", |_ctx: ActContext, _: String| async move { + Ok(()) + }); + + let worker_a_key = worker_a.worker_instance_key().to_string(); + let worker_b_key = worker_b.worker_instance_key().to_string(); + let _ = starter.start_with_worker(wf_name, &mut worker_a).await; + worker_a.run_until_done().await.unwrap(); + + let _ = starter_b.start_with_worker(wf_name, &mut worker_b).await; + worker_b.run_until_done().await.unwrap(); + + sleep(Duration::from_millis(200)).await; + + let all = list_worker_heartbeats(&client, String::new()).await; + let keys: HashSet<_> = all + .iter() + .map(|hb| hb.worker_instance_key.clone()) + .collect(); + assert!(keys.contains(&worker_a_key)); + assert!(keys.contains(&worker_b_key)); + + // Verify both heartbeats contain the same shared process_key + let process_keys: HashSet<_> = all + .iter() + .filter_map(|hb| hb.host_info.as_ref().map(|info| info.process_key.clone())) + .collect(); + assert!(process_keys.len() > starting_hb_len); + + let filtered = + list_worker_heartbeats(&client, format!("WorkerInstanceKey=\"{worker_a_key}\"")).await; + assert_eq!(filtered.len(), 1); + assert_eq!(filtered[0].worker_instance_key, worker_a_key); + + // Verify describe worker gives the same heartbeat as listworker + let mut raw_client = client.as_ref().clone(); + let describe_worker_a = WorkflowService::describe_worker( + &mut raw_client, + DescribeWorkerRequest { + namespace: client.namespace().to_owned(), + worker_instance_key: worker_a_key.to_string(), + } + .into_request(), + ) + .await + .unwrap() + .into_inner() + .worker_info + .unwrap() + .worker_heartbeat + .unwrap(); + assert_eq!(describe_worker_a, filtered[0]); + + let filtered_b = + list_worker_heartbeats(&client, format!("WorkerInstanceKey = \"{worker_b_key}\"")).await; + assert_eq!(filtered_b.len(), 1); + assert_eq!(filtered_b[0].worker_instance_key, worker_b_key); + let describe_worker_b = WorkflowService::describe_worker( + &mut raw_client, + DescribeWorkerRequest { + namespace: client.namespace().to_owned(), + worker_instance_key: worker_b_key.to_string(), + } + .into_request(), + ) + .await + .unwrap() + .into_inner() + .worker_info + .unwrap() + .worker_heartbeat + .unwrap(); + 
assert_eq!(describe_worker_b, filtered_b[0]); +} + +#[tokio::test] +async fn worker_heartbeat_failure_metrics() { + const WORKFLOW_CONTINUE_SIGNAL: &str = "workflow-continue"; + + let wf_name = "worker_heartbeat_failure_metrics"; + let mut starter = new_no_metrics_starter(wf_name); + starter.worker_config.max_outstanding_activities(5_usize); + + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + static ACT_COUNT: AtomicU64 = AtomicU64::new(0); + static WF_COUNT: AtomicU64 = AtomicU64::new(0); + static ACT_FAIL: Notify = Notify::const_new(); + static WF_FAIL: Notify = Notify::const_new(); + worker.register_wf(wf_name.to_string(), |ctx: WfContext| async move { + let _ = ctx + .activity(ActivityOptions { + activity_type: "failing_act".to_string(), + input: "boom".as_json_payload().expect("serialize"), + start_to_close_timeout: Some(Duration::from_secs(1)), + retry_policy: Some(RetryPolicy { + initial_interval: Some(prost_dur!(from_millis(10))), + backoff_coefficient: 1.0, + maximum_attempts: 4, + ..Default::default() + }), + ..Default::default() + }) + .await; + + if WF_COUNT.load(Ordering::Relaxed) == 0 { + WF_COUNT.fetch_add(1, Ordering::Relaxed); + WF_FAIL.notify_one(); + panic!("expected WF panic"); + } + + // Signal here to avoid workflow from completing and shutdown heartbeat from sending + // before we check workflow_slots.last_interval_failure_tasks + let mut proceed_signal = ctx.make_signal_channel(WORKFLOW_CONTINUE_SIGNAL); + proceed_signal.next().await.unwrap(); + Ok(().into()) + }); + + worker.register_activity("failing_act", |_ctx: ActContext, _: String| async move { + if ACT_COUNT.load(Ordering::Relaxed) == 3 { + return Ok(()); + } + ACT_COUNT.fetch_add(1, Ordering::Relaxed); + ACT_FAIL.notify_one(); + Err(anyhow!("Expected error").into()) + }); + + let worker_key = worker_instance_key.to_string(); + starter.workflow_options.retry_policy = Some(RetryPolicy { + maximum_attempts: 2, + ..Default::default() + }); + + let _ = starter.start_with_worker(wf_name, &mut worker).await; + + let test_fut = async { + ACT_FAIL.notified().await; + let client = starter.get_client().await; + eventually( + || async { + let mut raw_client = (*client).clone(); + + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + } + .into_request(), + ) + .await + .unwrap() + .into_inner(); + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + assert_eq!( + heartbeat.worker_instance_key, + worker_instance_key.to_string() + ); + let activity_slots = heartbeat.activity_task_slots_info.clone().unwrap(); + if activity_slots.last_interval_failure_tasks >= 1 { + return Ok(()); + } + Err("activity_slots.last_interval_failure_tasks still 0, retrying") + }, + Duration::from_millis(150), + ) + .await + .unwrap(); + + WF_FAIL.notified().await; + + eventually( + || async { + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + } + .into_request(), + ) + .await + 
.unwrap() + .into_inner(); + let worker_info = workers_list + .workers_info + .iter() + .find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }) + .unwrap(); + + let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap(); + let workflow_slots = heartbeat.workflow_task_slots_info.clone().unwrap(); + if workflow_slots.last_interval_failure_tasks >= 1 { + return Ok(()); + } + Err("workflow_slots.last_interval_failure_tasks still 0, retrying") + }, + Duration::from_millis(150), + ) + .await + .unwrap(); + client + .signal_workflow_execution( + starter.get_wf_id().to_string(), + String::new(), + WORKFLOW_CONTINUE_SIGNAL.to_string(), + None, + None, + ) + .await + .unwrap(); + }; + + let runner = async move { + worker.run_until_done().await.unwrap(); + }; + tokio::join!(test_fut, runner); + + let client = starter.get_client().await; + let mut heartbeats = + list_worker_heartbeats(&client, format!("WorkerInstanceKey=\"{worker_key}\"")).await; + assert_eq!(heartbeats.len(), 1); + let heartbeat = heartbeats.pop().unwrap(); + + let activity_slots = heartbeat.activity_task_slots_info.unwrap(); + assert_eq!(activity_slots.total_failed_tasks, 3); + + let workflow_slots = heartbeat.workflow_task_slots_info.unwrap(); + assert_eq!(workflow_slots.total_failed_tasks, 1); +} + +#[tokio::test] +async fn worker_heartbeat_no_runtime_heartbeat() { + let wf_name = "worker_heartbeat_no_runtime_heartbeat"; + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(get_integ_telem_options()) + .heartbeat_interval(None) // Turn heartbeating off + .build() + .unwrap(); + let rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); + let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + + worker.register_wf(wf_name.to_owned(), |ctx: WfContext| async move { + ctx.activity(ActivityOptions { + activity_type: "pass_fail_act".to_string(), + input: "pass".as_json_payload().expect("serializes fine"), + start_to_close_timeout: Some(Duration::from_secs(1)), + ..Default::default() + }) + .await; + Ok(().into()) + }); + + worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + Ok(i) + }); + + starter + .start_with_worker(wf_name.to_owned(), &mut worker) + .await; + + worker.run_until_done().await.unwrap(); + let client = starter.get_client().await; + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + } + .into_request(), + ) + .await + .unwrap() + .into_inner(); + + // Ensure worker has not ever heartbeated + let heartbeat = workers_list.workers_info.iter().find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }); + assert!(heartbeat.is_none()); +} + +#[tokio::test] +async fn worker_heartbeat_skip_client_worker_set_check() { + let wf_name = "worker_heartbeat_skip_client_worker_set_check"; + let runtimeopts = RuntimeOptionsBuilder::default() + .telemetry_options(get_integ_telem_options()) + .heartbeat_interval(Some(Duration::from_millis(100))) + .build() + .unwrap(); + let rt = CoreRuntime::new_assume_tokio(runtimeopts).unwrap(); + let mut starter = 
CoreWfStarter::new_with_runtime(wf_name, rt); + starter.worker_config.skip_client_worker_set_check(true); + let mut worker = starter.worker().await; + let worker_instance_key = worker.worker_instance_key(); + + worker.register_wf(wf_name.to_owned(), |ctx: WfContext| async move { + ctx.activity(ActivityOptions { + activity_type: "pass_fail_act".to_string(), + input: "pass".as_json_payload().expect("serializes fine"), + start_to_close_timeout: Some(Duration::from_secs(1)), + ..Default::default() + }) + .await; + Ok(().into()) + }); + + worker.register_activity("pass_fail_act", |_ctx: ActContext, i: String| async move { + Ok(i) + }); + + starter + .start_with_worker(wf_name.to_owned(), &mut worker) + .await; + + worker.run_until_done().await.unwrap(); + let client = starter.get_client().await; + let mut raw_client = (*client).clone(); + let workers_list = WorkflowService::list_workers( + &mut raw_client, + ListWorkersRequest { + namespace: client.namespace().to_owned(), + page_size: 100, + next_page_token: Vec::new(), + query: String::new(), + } + .into_request(), + ) + .await + .unwrap() + .into_inner(); + + // Ensure worker still heartbeats + let heartbeat = workers_list.workers_info.iter().find(|worker_info| { + if let Some(hb) = worker_info.worker_heartbeat.as_ref() { + hb.worker_instance_key == worker_instance_key.to_string() + } else { + false + } + }); + assert!(heartbeat.is_some()); +} diff --git a/tests/integ_tests/worker_tests.rs b/tests/integ_tests/worker_tests.rs index 38c7223da..2a0ed3c79 100644 --- a/tests/integ_tests/worker_tests.rs +++ b/tests/integ_tests/worker_tests.rs @@ -1,3 +1,4 @@ +use crate::common::get_integ_runtime_options; use crate::{ common::{CoreWfStarter, get_integ_server_options, get_integ_telem_options, mock_sdk_cfg}, shared_tests, @@ -19,8 +20,8 @@ use temporal_sdk::{ use temporal_sdk_core::{ CoreRuntime, ResourceBasedTuner, ResourceSlotOptions, TunerBuilder, init_worker, test_help::{ - FakeWfResponses, MockPollCfg, ResponseType, TEST_Q, build_mock_pollers, - drain_pollers_and_shutdown, hist_to_poll_resp, mock_worker, mock_worker_client, + FakeWfResponses, MockPollCfg, ResponseType, build_mock_pollers, drain_pollers_and_shutdown, + hist_to_poll_resp, mock_worker, mock_worker_client, }, }; use temporal_sdk_core_api::{ @@ -67,7 +68,9 @@ use uuid::Uuid; #[tokio::test] async fn worker_validation_fails_on_nonexistent_namespace() { let opts = get_integ_server_options(); - let runtime = CoreRuntime::new_assume_tokio(get_integ_telem_options()).unwrap(); + let runtime = + CoreRuntime::new_assume_tokio(get_integ_runtime_options(get_integ_telem_options())) + .unwrap(); let retrying_client = opts .connect_no_namespace(runtime.telemetry().get_temporal_metric_meter()) .await @@ -324,7 +327,7 @@ async fn activity_tasks_from_completion_reserve_slots() { cfg.max_outstanding_activities = Some(2); }); let core = Arc::new(mock_worker(mock)); - let mut worker = crate::common::TestWorker::new(core.clone(), TEST_Q.to_string()); + let mut worker = crate::common::TestWorker::new(core.clone()); // First poll for activities twice, occupying both slots let at1 = core.poll_activity_task().await.unwrap(); diff --git a/tests/integ_tests/workflow_tests.rs b/tests/integ_tests/workflow_tests.rs index 6d10fbb71..263c942b0 100644 --- a/tests/integ_tests/workflow_tests.rs +++ b/tests/integ_tests/workflow_tests.rs @@ -18,6 +18,7 @@ mod stickyness; mod timers; mod upsert_search_attrs; +use crate::common::get_integ_runtime_options; use crate::{ common::{ CoreWfStarter, history_from_proto_binary, 
init_core_and_create_wf, @@ -67,7 +68,6 @@ use temporal_sdk_core_protos::{ test_utils::schedule_activity_cmd, }; use tokio::{join, sync::Notify, time::sleep}; - // TODO: We should get expected histories for these tests and confirm that the history at the end // matches. @@ -764,7 +764,7 @@ async fn nondeterminism_errors_fail_workflow_when_configured_to( #[values(true, false)] whole_worker: bool, ) { let (telemopts, addr, _aborter) = prom_metrics(None); - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let wf_name = "nondeterminism_errors_fail_workflow_when_configured_to"; let mut starter = CoreWfStarter::new_with_runtime(wf_name, rt); starter.worker_config.no_remote_activities(true); diff --git a/tests/main.rs b/tests/main.rs index 8828d4899..06eb96972 100644 --- a/tests/main.rs +++ b/tests/main.rs @@ -22,12 +22,14 @@ mod integ_tests { mod queries_tests; mod update_tests; mod visibility_tests; + mod worker_heartbeat_tests; mod worker_tests; mod worker_versioning_tests; mod workflow_tests; use crate::common::{ - CoreWfStarter, get_integ_server_options, get_integ_telem_options, rand_6_chars, + CoreWfStarter, get_integ_runtime_options, get_integ_server_options, + get_integ_telem_options, rand_6_chars, }; use std::time::Duration; use temporal_client::{NamespacedClient, WorkflowService}; @@ -45,7 +47,9 @@ mod integ_tests { #[ignore] // Really a compile time check more than anything async fn lang_bridge_example() { let opts = get_integ_server_options(); - let runtime = CoreRuntime::new_assume_tokio(get_integ_telem_options()).unwrap(); + let runtime = + CoreRuntime::new_assume_tokio(get_integ_runtime_options(get_integ_telem_options())) + .unwrap(); let mut retrying_client = opts .connect_no_namespace(runtime.telemetry().get_temporal_metric_meter()) .await diff --git a/tests/manual_tests.rs b/tests/manual_tests.rs index 8f5ef4c5b..9588be3f3 100644 --- a/tests/manual_tests.rs +++ b/tests/manual_tests.rs @@ -5,6 +5,7 @@ #[allow(dead_code)] mod common; +use crate::common::get_integ_runtime_options; use common::{CoreWfStarter, prom_metrics, rand_6_chars}; use futures_util::{ StreamExt, @@ -41,7 +42,7 @@ async fn poller_load_spiky() { } else { prom_metrics(None) }; - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let mut starter = CoreWfStarter::new_with_runtime("poller_load", rt); starter .worker_config @@ -200,7 +201,7 @@ async fn poller_load_sustained() { } else { prom_metrics(None) }; - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let mut starter = CoreWfStarter::new_with_runtime("poller_load", rt); starter .worker_config @@ -291,7 +292,7 @@ async fn poller_load_spike_then_sustained() { } else { prom_metrics(None) }; - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let mut starter = CoreWfStarter::new_with_runtime("poller_load", rt); starter .worker_config diff --git a/tests/workflow_replay_bench.rs b/tests/workflow_replay_bench.rs index 4200ebd3a..d80796b0a 100644 --- a/tests/workflow_replay_bench.rs +++ b/tests/workflow_replay_bench.rs @@ -5,7 +5,9 @@ #[allow(dead_code)] mod common; -use crate::common::{DONT_AUTO_INIT_INTEG_TELEM, prom_metrics, replay_sdk_worker}; +use crate::common::{ + 
DONT_AUTO_INIT_INTEG_TELEM, get_integ_runtime_options, prom_metrics, replay_sdk_worker, +}; use criterion::{BatchSize, Criterion, criterion_group, criterion_main}; use futures_util::StreamExt; use std::{ @@ -80,7 +82,7 @@ pub fn bench_metrics(c: &mut Criterion) { let _tokio = tokio_runtime.enter(); let (mut telemopts, _addr, _aborter) = prom_metrics(None); telemopts.logging = None; - let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap(); + let rt = CoreRuntime::new_assume_tokio(get_integ_runtime_options(telemopts)).unwrap(); let meter = rt.telemetry().get_metric_meter().unwrap(); c.bench_function("Record with new attributes on each call", move |b| {