@@ -37,6 +37,7 @@ import (
3737 "go.temporal.io/server/common/persistence/serialization"
3838 "go.temporal.io/server/common/persistence/visibility/manager"
3939 "go.temporal.io/server/common/primitives/timestamp"
40+ "go.temporal.io/server/common/rpc/interceptor"
4041 "go.temporal.io/server/common/searchattribute"
4142 serviceerrors "go.temporal.io/server/common/serviceerror"
4243 "go.temporal.io/server/common/tasktoken"
7778 persistenceVisibilityManager manager.VisibilityManager
7879 persistenceHealthSignal persistence.HealthSignalAggregator
7980 healthServer * health.Server
81+ historyHealthSignal interceptor.HealthSignalAggregator
8082 historyServiceResolver membership.ServiceResolver
8183 metricsHandler metrics.Handler
8284 payloadSerializer serialization.Serializer
@@ -106,6 +108,7 @@ type (
106108 PersistenceExecutionManager persistence.ExecutionManager
107109 PersistenceShardManager persistence.ShardManager
108110 PersistenceHealthSignal persistence.HealthSignalAggregator
111+ HistoryHealthSignal interceptor.HealthSignalAggregator
109112 HealthServer * health.Server
110113 PersistenceVisibilityManager manager.VisibilityManager
111114 HistoryServiceResolver membership.ServiceResolver
@@ -206,6 +209,11 @@ func (h *Handler) DeepHealthCheck(
206209 return & historyservice.DeepHealthCheckResponse {State : enumsspb .HEALTH_STATE_DECLINED_SERVING }, nil
207210 }
208211
212+ rsp := h .checkHistoryHealthSignals ()
213+ if rsp != nil {
214+ return rsp , nil
215+ }
216+
209217 latency := h .persistenceHealthSignal .AverageLatency ()
210218 errRatio := h .persistenceHealthSignal .ErrorRatio ()
211219
@@ -217,6 +225,25 @@ func (h *Handler) DeepHealthCheck(
217225 return & historyservice.DeepHealthCheckResponse {State : enumsspb .HEALTH_STATE_SERVING }, nil
218226}
219227
228+ // checkHistoryHealthSignal checks the history health signal that is captured by the interceptor.
229+ func (h * Handler ) checkHistoryHealthSignals () * historyservice.DeepHealthCheckResponse {
230+ // Check that the RPC latency doesn't exceed the threshold.
231+ if _ , ok := h .historyHealthSignal .(* interceptor.NoopSignalAggregator ); ok {
232+ h .logger .Warn ("health signal aggregator is using noop implementation" )
233+ }
234+ if h .historyHealthSignal .AverageLatency () > h .config .HealthRPCLatencyFailure () {
235+ metrics .HistoryHostHealthGauge .With (h .metricsHandler ).Record (float64 (enumsspb .HEALTH_STATE_NOT_SERVING ))
236+ return & historyservice.DeepHealthCheckResponse {State : enumsspb .HEALTH_STATE_NOT_SERVING }
237+ }
238+
239+ // Check if the RPC error ratio exceeds the threshold
240+ if h .historyHealthSignal .ErrorRatio () > h .config .HealthRPCErrorRatio () {
241+ metrics .HistoryHostHealthGauge .With (h .metricsHandler ).Record (float64 (enumsspb .HEALTH_STATE_NOT_SERVING ))
242+ return & historyservice.DeepHealthCheckResponse {State : enumsspb .HEALTH_STATE_NOT_SERVING }
243+ }
244+ return nil
245+ }
246+
220247// IsWorkflowTaskValid - whether workflow task is still valid
221248func (h * Handler ) IsWorkflowTaskValid (ctx context.Context , request * historyservice.IsWorkflowTaskValidRequest ) (_ * historyservice.IsWorkflowTaskValidResponse , retError error ) {
222249 defer log .CapturePanic (h .logger , & retError )
0 commit comments