Skip to content

Commit 11a30b2

Browse files
authored
Per shard per namespace RPS warning log (#4525)
* add warning log for high per shard per ns rps
1 parent 39bf6a8 commit 11a30b2

File tree

6 files changed

+124
-99
lines changed

6 files changed

+124
-99
lines changed

common/dynamicconfig/constants.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ const (
116116
PersistenceHealthSignalBufferSize = "system.persistenceHealthSignalBufferSize"
117117
// ShardRPSWarnLimit is the per-shard RPS limit for warning
118118
ShardRPSWarnLimit = "system.shardRPSWarnLimit"
119+
// ShardPerNsRPSWarnPercent is the per-shard per-namespace RPS limit for warning, expressed as a fraction of ShardRPSWarnLimit (e.g. 0.8 means 80% of ShardRPSWarnLimit)
120+
// these warnings are not emitted if the value is set to 0 or less
121+
ShardPerNsRPSWarnPercent = "system.shardPerNsRPSWarnPercent"
119122

120123
// Whether the deadlock detector should dump goroutines
121124
DeadlockDumpGoroutines = "system.deadlock.DumpGoroutines"

common/log/tag/tags.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,11 @@ func RequestCount(c int) ZapTag {
477477
return NewInt("request-count", c)
478478
}
479479

480+
// RPS returns tag for requests per second
481+
func RPS(c int64) ZapTag {
482+
return NewInt64("rps", c)
483+
}
484+
480485
// Number returns tag for Number
481486
func Number(n int64) ZapTag {
482487
return NewInt64("number", n)

common/persistence/client/fx.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ func FactoryProvider(
116116
func HealthSignalAggregatorProvider(
117117
dynamicCollection *dynamicconfig.Collection,
118118
metricsHandler metrics.Handler,
119-
logger log.Logger,
119+
logger log.ThrottledLogger,
120120
) persistence.HealthSignalAggregator {
121121
if dynamicCollection.GetBoolProperty(dynamicconfig.PersistenceHealthSignalMetricsEnabled, true)() {
122122
return persistence.NewHealthSignalAggregatorImpl(
@@ -125,6 +125,7 @@ func HealthSignalAggregatorProvider(
125125
dynamicCollection.GetIntProperty(dynamicconfig.PersistenceHealthSignalBufferSize, 5000)(),
126126
metricsHandler,
127127
dynamicCollection.GetIntProperty(dynamicconfig.ShardRPSWarnLimit, 50),
128+
dynamicCollection.GetFloat64Property(dynamicconfig.ShardPerNsRPSWarnPercent, 0.8),
128129
logger,
129130
)
130131
}

common/persistence/health_signal_aggregator.go

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ const (
4444
type (
4545
HealthSignalAggregator interface {
4646
common.Daemon
47-
Record(callerSegment int32, latency time.Duration, err error)
47+
Record(callerSegment int32, namespace string, latency time.Duration, err error)
4848
AverageLatency() float64
4949
ErrorRatio() float64
5050
}
@@ -53,16 +53,18 @@ type (
5353
status int32
5454
shutdownCh chan struct{}
5555

56-
requestsPerShard map[int32]int64
57-
requestsLock sync.Mutex
56+
// map of shardID -> map of namespace -> request count
57+
requestCounts map[int32]map[string]int64
58+
requestsLock sync.Mutex
5859

5960
aggregationEnabled bool
6061
latencyAverage aggregate.MovingWindowAverage
6162
errorRatio aggregate.MovingWindowAverage
6263

63-
metricsHandler metrics.Handler
64-
emitMetricsTimer *time.Ticker
65-
perShardRPSWarnLimit dynamicconfig.IntPropertyFn
64+
metricsHandler metrics.Handler
65+
emitMetricsTimer *time.Ticker
66+
perShardRPSWarnLimit dynamicconfig.IntPropertyFn
67+
perShardPerNsRPSWarnLimit dynamicconfig.FloatPropertyFn
6668

6769
logger log.Logger
6870
}
@@ -74,17 +76,19 @@ func NewHealthSignalAggregatorImpl(
7476
maxBufferSize int,
7577
metricsHandler metrics.Handler,
7678
perShardRPSWarnLimit dynamicconfig.IntPropertyFn,
79+
perShardPerNsRPSWarnLimit dynamicconfig.FloatPropertyFn,
7780
logger log.Logger,
7881
) *HealthSignalAggregatorImpl {
7982
ret := &HealthSignalAggregatorImpl{
80-
status: common.DaemonStatusInitialized,
81-
shutdownCh: make(chan struct{}),
82-
requestsPerShard: make(map[int32]int64),
83-
metricsHandler: metricsHandler,
84-
emitMetricsTimer: time.NewTicker(emitMetricsInterval),
85-
perShardRPSWarnLimit: perShardRPSWarnLimit,
86-
logger: logger,
87-
aggregationEnabled: aggregationEnabled,
83+
status: common.DaemonStatusInitialized,
84+
shutdownCh: make(chan struct{}),
85+
requestCounts: make(map[int32]map[string]int64),
86+
metricsHandler: metricsHandler,
87+
emitMetricsTimer: time.NewTicker(emitMetricsInterval),
88+
perShardRPSWarnLimit: perShardRPSWarnLimit,
89+
perShardPerNsRPSWarnLimit: perShardPerNsRPSWarnLimit,
90+
logger: logger,
91+
aggregationEnabled: aggregationEnabled,
8892
}
8993

9094
if aggregationEnabled {
@@ -113,7 +117,7 @@ func (s *HealthSignalAggregatorImpl) Stop() {
113117
s.emitMetricsTimer.Stop()
114118
}
115119

116-
func (s *HealthSignalAggregatorImpl) Record(callerSegment int32, latency time.Duration, err error) {
120+
func (s *HealthSignalAggregatorImpl) Record(callerSegment int32, namespace string, latency time.Duration, err error) {
117121
if s.aggregationEnabled {
118122
s.latencyAverage.Record(latency.Milliseconds())
119123

@@ -125,7 +129,7 @@ func (s *HealthSignalAggregatorImpl) Record(callerSegment int32, latency time.Du
125129
}
126130

127131
if callerSegment != CallerSegmentMissing {
128-
s.incrementShardRequestCount(callerSegment)
132+
s.incrementShardRequestCount(callerSegment, namespace)
129133
}
130134
}
131135

@@ -137,10 +141,13 @@ func (s *HealthSignalAggregatorImpl) ErrorRatio() float64 {
137141
return s.errorRatio.Average()
138142
}
139143

140-
func (s *HealthSignalAggregatorImpl) incrementShardRequestCount(shardID int32) {
144+
func (s *HealthSignalAggregatorImpl) incrementShardRequestCount(shardID int32, namespace string) {
141145
s.requestsLock.Lock()
142146
defer s.requestsLock.Unlock()
143-
s.requestsPerShard[shardID]++
147+
if s.requestCounts[shardID] == nil {
148+
s.requestCounts[shardID] = make(map[string]int64)
149+
}
150+
s.requestCounts[shardID][namespace]++
144151
}
145152

146153
func (s *HealthSignalAggregatorImpl) emitMetricsLoop() {
@@ -150,15 +157,24 @@ func (s *HealthSignalAggregatorImpl) emitMetricsLoop() {
150157
return
151158
case <-s.emitMetricsTimer.C:
152159
s.requestsLock.Lock()
153-
requestCounts := s.requestsPerShard
154-
s.requestsPerShard = make(map[int32]int64, len(requestCounts))
160+
requestCounts := s.requestCounts
161+
s.requestCounts = make(map[int32]map[string]int64, len(requestCounts))
155162
s.requestsLock.Unlock()
156163

157-
for shardID, count := range requestCounts {
158-
shardRPS := int64(float64(count) / emitMetricsInterval.Seconds())
164+
for shardID, requestCountPerNS := range requestCounts {
165+
shardRequestCount := int64(0)
166+
for namespace, count := range requestCountPerNS {
167+
shardRequestCount += count
168+
shardRPSPerNS := int64(float64(count) / emitMetricsInterval.Seconds())
169+
if s.perShardPerNsRPSWarnLimit() > 0.0 && shardRPSPerNS > int64(s.perShardPerNsRPSWarnLimit()*float64(s.perShardRPSWarnLimit())) {
170+
s.logger.Warn("Per shard per namespace RPS warn limit exceeded", tag.ShardID(shardID), tag.WorkflowNamespace(namespace), tag.RPS(shardRPSPerNS))
171+
}
172+
}
173+
174+
shardRPS := int64(float64(shardRequestCount) / emitMetricsInterval.Seconds())
159175
s.metricsHandler.Histogram(metrics.PersistenceShardRPS.GetMetricName(), metrics.PersistenceShardRPS.GetMetricUnit()).Record(shardRPS)
160176
if shardRPS > int64(s.perShardRPSWarnLimit()) {
161-
s.logger.Warn("Per shard RPS warn limit exceeded", tag.ShardID(shardID))
177+
s.logger.Warn("Per shard RPS warn limit exceeded", tag.ShardID(shardID), tag.RPS(shardRPS))
162178
}
163179
}
164180
}

common/persistence/noop_health_signal_aggregator.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ func (a *noopSignalAggregator) Start() {}
4040

4141
func (a *noopSignalAggregator) Stop() {}
4242

43-
func (a *noopSignalAggregator) Record(_ int32, _ time.Duration, _ error) {}
43+
func (a *noopSignalAggregator) Record(_ int32, _ string, _ time.Duration, _ error) {}
4444

4545
func (a *noopSignalAggregator) AverageLatency() float64 {
4646
return 0

0 commit comments

Comments
 (0)