Commit 4e14c2d

yux0 authored and alexshtin committed
Adding replication task processing metrics (#3452)
1 parent a8e0f07 commit 4e14c2d

File tree

3 files changed (+20, -11 lines)


common/metrics/defs.go

Lines changed: 2 additions & 0 deletions
@@ -1801,6 +1801,7 @@ const (
 	ReplicationTasksLag
 	ReplicationTasksFetched
 	ReplicationTasksReturned
+	ReplicationTasksAppliedLatency
 	ReplicationDLQFailed
 	ReplicationDLQMaxLevelGauge
 	ReplicationDLQAckLevelGauge
@@ -2216,6 +2217,7 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
 	ReplicationTasksLag: {metricName: "replication_tasks_lag", metricType: Timer},
 	ReplicationTasksFetched: {metricName: "replication_tasks_fetched", metricType: Timer},
 	ReplicationTasksReturned: {metricName: "replication_tasks_returned", metricType: Timer},
+	ReplicationTasksAppliedLatency: {metricName: "replication_tasks_applied_latency", metricType: Timer},
 	ReplicationDLQFailed: {metricName: "replication_dlq_enqueue_failed", metricType: Counter},
 	ReplicationDLQMaxLevelGauge: {metricName: "replication_dlq_max_level", metricType: Gauge},
 	ReplicationDLQAckLevelGauge: {metricName: "replication_dlq_ack_level", metricType: Gauge},
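
For context, the defs.go change follows the registry's usual two-step pattern: a new metric index constant is appended to the iota-based const block, and the same index is mapped to its emitted name and type in MetricDefs. Below is a minimal, simplified sketch of that pattern; only the metric names come from the diff, while the ServiceIdx dimension and most other fields of the real file are omitted.

```go
// Simplified sketch of the metric-registration pattern in common/metrics/defs.go.
// Types here are stand-ins; the real file keys MetricDefs by ServiceIdx as well.
package metrics

type MetricType int

const (
	Counter MetricType = iota
	Timer
	Gauge
)

type metricDefinition struct {
	metricName string
	metricType MetricType
}

// Metric indices: a constant added here needs a matching MetricDefs entry,
// so both places must change together (as they do in this commit).
const (
	ReplicationTasksReturned = iota
	ReplicationTasksAppliedLatency // added by this commit
	ReplicationDLQFailed
)

var MetricDefs = map[int]metricDefinition{
	ReplicationTasksReturned:       {metricName: "replication_tasks_returned", metricType: Timer},
	ReplicationTasksAppliedLatency: {metricName: "replication_tasks_applied_latency", metricType: Timer},
	ReplicationDLQFailed:           {metricName: "replication_dlq_enqueue_failed", metricType: Counter},
}
```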

service/history/replicationTaskExecutor.go

Lines changed: 6 additions & 2 deletions
@@ -119,6 +119,8 @@ func (e *replicationTaskExecutorImpl) handleActivityTask(
 		return err
 	}
 
+	replicationStopWatch := e.metricsClient.StartTimer(metrics.SyncActivityTaskScope, metrics.ServiceLatency)
+	defer replicationStopWatch.Stop()
 	request := &historyservice.SyncActivityRequest{
 		NamespaceId: attr.NamespaceId,
 		WorkflowId: attr.WorkflowId,
@@ -177,6 +179,8 @@ func (e *replicationTaskExecutorImpl) handleHistoryReplicationTaskV2(
 		return err
 	}
 
+	replicationStopWatch := e.metricsClient.StartTimer(metrics.HistoryReplicationV2TaskScope, metrics.ServiceLatency)
+	defer replicationStopWatch.Stop()
 	request := &historyservice.ReplicateEventsV2Request{
 		NamespaceId: attr.NamespaceId,
 		WorkflowExecution: &commonpb.WorkflowExecution{
@@ -197,8 +201,8 @@ func (e *replicationTaskExecutorImpl) handleHistoryReplicationTaskV2(
 		return err
 	}
 	e.metricsClient.IncCounter(metrics.HistoryRereplicationByHistoryReplicationScope, metrics.ClientRequests)
-	stopwatch := e.metricsClient.StartTimer(metrics.HistoryRereplicationByHistoryReplicationScope, metrics.ClientLatency)
-	defer stopwatch.Stop()
+	resendStopWatch := e.metricsClient.StartTimer(metrics.HistoryRereplicationByHistoryReplicationScope, metrics.ClientLatency)
+	defer resendStopWatch.Stop()
 
 	if resendErr := e.nDCHistoryResender.SendSingleWorkflowHistory(
 		retryErr.NamespaceId,
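
The executor change wraps each handler in a per-scope latency timer: a stopwatch is started once the preceding checks succeed and stopped via defer, so the recorded ServiceLatency covers the remainder of the handler, including the call into the history service. The following is a hedged sketch of that stopwatch pattern; the Stopwatch and MetricsClient types are simplified stand-ins, not Temporal's real metrics interfaces.

```go
// Minimal sketch of the StartTimer / defer Stop pattern used in
// handleActivityTask and handleHistoryReplicationTaskV2.
package replication

import "time"

const (
	SyncActivityTaskScope = iota
	ServiceLatency
)

// Stopwatch is a simplified stand-in for the stopwatch returned by StartTimer.
type Stopwatch struct {
	start  time.Time
	record func(elapsed time.Duration)
}

// Stop records the elapsed time since the stopwatch was started.
func (s Stopwatch) Stop() { s.record(time.Since(s.start)) }

// MetricsClient is a simplified stand-in for the metrics client interface.
type MetricsClient interface {
	StartTimer(scope int, timer int) Stopwatch
}

// handleActivityTask shows the shape of the change: everything after the
// early-return checks, including the downstream call, is covered by the
// deferred Stop.
func handleActivityTask(m MetricsClient, validate func() error, callHistoryService func() error) error {
	if err := validate(); err != nil {
		return err // early return: no latency recorded, matching the diff's placement
	}

	replicationStopWatch := m.StartTimer(SyncActivityTaskScope, ServiceLatency)
	defer replicationStopWatch.Stop()

	return callHistoryService()
}
```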

service/history/replicationTaskProcessor.go

Lines changed: 12 additions & 9 deletions
@@ -281,14 +281,8 @@ func (p *ReplicationTaskProcessorImpl) sendFetchMessageRequest() <-chan *replica
 func (p *ReplicationTaskProcessorImpl) processResponse(response *replicationspb.ReplicationMessages) {
 
 	p.syncShardChan <- response.GetSyncShardStatus()
-	// Note here we check replication tasks instead of hasMore. The expectation is that in a steady state
-	// we will receive replication tasks but hasMore is false (meaning that we are always catching up).
-	// So hasMore might not be a good indicator for additional wait.
-	if len(response.ReplicationTasks) == 0 {
-		backoffDuration := p.noTaskRetrier.NextBackOff()
-		time.Sleep(backoffDuration)
-		return
-	}
+	scope := p.metricsClient.Scope(metrics.ReplicationTaskFetcherScope, metrics.TargetClusterTag(p.sourceCluster))
+	batchRequestStartTime := time.Now()
 
 	for _, replicationTask := range response.ReplicationTasks {
 		err := p.processSingleTask(replicationTask)
@@ -298,9 +292,18 @@ func (p *ReplicationTaskProcessorImpl) processResponse(response *replicationspb.
 		}
 	}
 
+	// Note here we check replication tasks instead of hasMore. The expectation is that in a steady state
+	// we will receive replication tasks but hasMore is false (meaning that we are always catching up).
+	// So hasMore might not be a good indicator for additional wait.
+	if len(response.ReplicationTasks) == 0 {
+		backoffDuration := p.noTaskRetrier.NextBackOff()
+		time.Sleep(backoffDuration)
+	} else {
+		scope.RecordTimer(metrics.ReplicationTasksAppliedLatency, time.Now().Sub(batchRequestStartTime))
+	}
+
 	p.lastProcessedMessageID = response.GetLastRetrievedMessageId()
 	p.lastRetrievedMessageID = response.GetLastRetrievedMessageId()
-	scope := p.metricsClient.Scope(metrics.ReplicationTaskFetcherScope, metrics.TargetClusterTag(p.sourceCluster))
 	scope.UpdateGauge(metrics.LastRetrievedMessageID, float64(p.lastRetrievedMessageID))
 	p.noTaskRetrier.Reset()
 }
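
Taken together, the processor change moves the no-task check below the apply loop and adds the new per-batch timer: the batch start time is captured before iterating, replication_tasks_applied_latency is recorded only when the poll actually returned tasks, and an empty poll falls through to the backoff sleep instead. A hedged sketch of the resulting control flow follows, with simplified stand-ins for the real processor, retrier, task, and metrics scope types.

```go
// Sketch of the processResponse flow after this commit. Scope, the backoff
// function, and the task type are simplified stand-ins for the real types.
package replication

import "time"

const (
	ReplicationTasksAppliedLatency = iota
	LastRetrievedMessageID
)

type Scope interface {
	RecordTimer(timer int, d time.Duration)
	UpdateGauge(gauge int, value float64)
}

type processor struct {
	scope                  Scope
	noTaskBackOff          func() time.Duration // stand-in for noTaskRetrier.NextBackOff
	processSingleTask      func(task interface{}) error
	lastProcessedMessageID int64
	lastRetrievedMessageID int64
}

func (p *processor) processResponse(tasks []interface{}, lastRetrievedMessageID int64) {
	// Capture the batch start before applying any task so the timer covers the whole loop.
	batchRequestStartTime := time.Now()

	for _, task := range tasks {
		if err := p.processSingleTask(task); err != nil {
			return // error handling simplified; the real processor deals with the failure here
		}
	}

	// Tasks, not hasMore, decide whether to back off: per the comment in the diff,
	// hasMore is false in steady state even though tasks keep arriving.
	if len(tasks) == 0 {
		time.Sleep(p.noTaskBackOff())
	} else {
		p.scope.RecordTimer(ReplicationTasksAppliedLatency, time.Since(batchRequestStartTime))
	}

	p.lastProcessedMessageID = lastRetrievedMessageID
	p.lastRetrievedMessageID = lastRetrievedMessageID
	p.scope.UpdateGauge(LastRetrievedMessageID, float64(p.lastRetrievedMessageID))
	// (the real code also resets the no-task retrier here)
}
```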
