Skip to content

Commit 9732f77

Browse files
authored
Emit estimated matching task lag metric (#2605)
1 parent 3a3057e commit 9732f77

File tree

8 files changed

+35
-8
lines changed

8 files changed

+35
-8
lines changed

common/metrics/defs.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2131,6 +2131,7 @@ const (
21312131
TaskQueueStoppedCounter
21322132
TaskWriteThrottlePerTaskQueueCounter
21332133
TaskWriteLatencyPerTaskQueue
2134+
TaskLagPerTaskQueueGauge
21342135

21352136
NumMatchingMetrics
21362137
)
@@ -2588,6 +2589,7 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
25882589
TaskQueueStoppedCounter: NewCounterDef("task_queue_stopped"),
25892590
TaskWriteThrottlePerTaskQueueCounter: NewRollupCounterDef("task_write_throttle_count_per_tl", "task_write_throttle_count"),
25902591
TaskWriteLatencyPerTaskQueue: NewRollupTimerDef("task_write_latency_per_tl", "task_write_latency"),
2592+
TaskLagPerTaskQueueGauge: NewGaugeDef("task_lag_per_tl"),
25912593
},
25922594
Worker: {
25932595
ReplicatorMessages: NewCounterDef("replicator_messages"),

common/metrics/tags.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"strconv"
3030
"strings"
3131

32+
"go.temporal.io/api/enums/v1"
3233
enumspb "go.temporal.io/api/enums/v1"
3334
)
3435

@@ -135,6 +136,10 @@ func TaskQueueTag(value string) Tag {
135136
return &tagImpl{key: taskQueue, value: sanitizer.Value(value)}
136137
}
137138

139+
// TaskQueueTypeTag returns a new tag keyed by TaskTypeTagName whose value is
// the String() form of the given task queue type.
func TaskQueueTypeTag(tqType enums.TaskQueueType) Tag {
140+
return &tagImpl{key: TaskTypeTagName, value: tqType.String()}
141+
}
142+
138143
// WorkflowTypeTag returns a new workflow type tag.
139144
func WorkflowTypeTag(value string) Tag {
140145
if len(value) == 0 {

common/metrics/temporal_queues.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,8 @@ func GetPerTaskQueueScope(
4242
metricTaskQueueName = unknownValue
4343
}
4444

45-
return baseScope.Tagged(NamespaceTag(namespaceName), TaskQueueTag(metricTaskQueueName))
45+
return baseScope.Tagged(
46+
NamespaceTag(namespaceName),
47+
TaskQueueTag(metricTaskQueueName),
48+
)
4649
}

service/history/historyEngine.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1376,8 +1376,12 @@ func (e *historyEngineImpl) RecordActivityTaskStarted(
13761376
namespaceName := namespaceEntry.Name()
13771377
taskQueueName := ai.GetTaskQueue()
13781378

1379-
metrics.GetPerTaskQueueScope(metricsScope, namespaceName.String(), taskQueueName, enumspb.TASK_QUEUE_KIND_NORMAL).
1380-
Tagged(metrics.TaskTypeTag("activity")).
1379+
metrics.GetPerTaskQueueScope(
1380+
metricsScope,
1381+
namespaceName.String(),
1382+
taskQueueName,
1383+
enumspb.TASK_QUEUE_KIND_NORMAL,
1384+
).Tagged(metrics.TaskQueueTypeTag(enumspb.TASK_QUEUE_TYPE_ACTIVITY)).
13811385
RecordTimer(metrics.TaskScheduleToStartLatency, scheduleToStartLatency)
13821386

13831387
response.StartedTime = ai.StartedTime

service/history/workflowTaskHandlerCallbacks.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,8 +234,12 @@ func (handler *workflowTaskHandlerCallbacksImpl) handleWorkflowTaskStarted(
234234
workflowScheduleToStartLatency := workflowTask.StartedTime.Sub(*workflowTask.ScheduledTime)
235235
namespaceName := namespaceEntry.Name()
236236
taskQueue := workflowTask.TaskQueue
237-
metrics.GetPerTaskQueueScope(metricsScope, namespaceName.String(), taskQueue.GetName(), taskQueue.GetKind()).
238-
Tagged(metrics.TaskTypeTag("workflow")).
237+
metrics.GetPerTaskQueueScope(
238+
metricsScope,
239+
namespaceName.String(),
240+
taskQueue.GetName(),
241+
taskQueue.GetKind(),
242+
).Tagged(metrics.TaskQueueTypeTag(enumspb.TASK_QUEUE_TYPE_WORKFLOW)).
239243
RecordTimer(metrics.TaskScheduleToStartLatency, workflowScheduleToStartLatency)
240244

241245
resp, err = handler.createRecordWorkflowTaskStartedResponse(mutableState, workflowTask, req.PollRequest.GetIdentity())

service/matching/matchingEngine_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -844,7 +844,7 @@ func (s *matchingEngineSuite) TestSyncMatchActivities() {
844844
}
845845

846846
time.Sleep(20 * time.Millisecond) // So any buffer tasks from 0 rps get picked up
847-
syncCtr := scope.Snapshot().Counters()["test.sync_throttle_count_per_tl+namespace="+matchingTestNamespace+",operation=TaskQueueMgr,service_name=matching,taskqueue=makeToast"]
847+
syncCtr := scope.Snapshot().Counters()["test.sync_throttle_count_per_tl+namespace="+matchingTestNamespace+",operation=TaskQueueMgr,service_name=matching,task_type=Activity,taskqueue=makeToast"]
848848
s.Equal(1, int(syncCtr.Value())) // Check times zero rps is set = throttle counter
849849
s.EqualValues(1, s.taskManager.getCreateTaskCount(tlID)) // Check times zero rps is set = Tasks stored in persistence
850850
s.EqualValues(0, s.taskManager.getTaskCount(tlID))

service/matching/taskQueueManager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ func newTaskQueueManager(
174174
nsName.String(),
175175
taskQueue.name,
176176
taskQueueKind,
177-
)
177+
).Tagged(metrics.TaskQueueTypeTag(taskQueue.taskType))
178178
tlMgr := &taskQueueManagerImpl{
179179
status: common.DaemonStatusInitialized,
180180
namespaceRegistry: e.namespaceRegistry,

service/matching/taskReader.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,9 @@ func (tr *taskReader) addSingleTaskToBuffer(
268268
}
269269

270270
// persistAckLevel reads the current ack level from the task ack manager,
// emits the estimated task lag gauge for that level, and then passes the same
// level to db.UpdateState, returning that call's error.
func (tr *taskReader) persistAckLevel() error {
271-
return tr.tlMgr.db.UpdateState(tr.tlMgr.taskAckManager.getAckLevel())
271+
ackLevel := tr.tlMgr.taskAckManager.getAckLevel()
272+
tr.emitTaskLagMetric(ackLevel)
273+
return tr.tlMgr.db.UpdateState(ackLevel)
272274
}
273275

274276
func (tr *taskReader) isTaskAddedRecently(lastAddTime time.Time) bool {
@@ -282,3 +284,10 @@ func (tr *taskReader) logger() log.Logger {
282284
// scope returns the metrics scope stored on the reader's task queue manager
// (tr.tlMgr.metricScope).
func (tr *taskReader) scope() metrics.Scope {
283285
return tr.tlMgr.metricScope
284286
}
287+
288+
// emitTaskLagMetric updates the TaskLagPerTaskQueueGauge gauge on the reader's
// metrics scope with the difference between the task writer's max read level
// and the supplied ackLevel.
func (tr *taskReader) emitTaskLagMetric(ackLevel int64) {
289+
// Note: this metric is only an estimate of the lag.
290+
// Task IDs in the DB may not be contiguous, especially when task queue ownership changes.
291+
maxReadLevel := tr.tlMgr.taskWriter.GetMaxReadLevel()
292+
tr.scope().UpdateGauge(metrics.TaskLagPerTaskQueueGauge, float64(maxReadLevel-ackLevel))
293+
}

0 commit comments

Comments
 (0)