Skip to content

Commit 11f4e05

Browse files
Rename RunningQueueSize metric and scorer to RunningRequestsSize
1 parent fa6ad7c commit 11f4e05

File tree

12 files changed

+52
-51
lines changed

12 files changed

+52
-51
lines changed

cmd/epp/runner/runner.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ func (r *Runner) registerInTreePlugins() {
432432
plugins.Register(profile.SingleProfileHandlerType, profile.SingleProfileHandlerFactory)
433433
plugins.Register(scorer.KvCacheUtilizationScorerType, scorer.KvCacheUtilizationScorerFactory)
434434
plugins.Register(scorer.QueueScorerType, scorer.QueueScorerFactory)
435-
plugins.Register(scorer.RunningQueueSizeScorerType, scorer.RunningQueueSizeScorerFactory)
435+
plugins.Register(scorer.RunningRequestsSizeScorerType, scorer.RunningRequestsSizeScorerFactory)
436436
plugins.Register(scorer.LoraAffinityScorerType, scorer.LoraAffinityScorerFactory)
437437
// Latency predictor plugins
438438
plugins.Register(slo_aware_router.SLOAwareRouterPluginType, slo_aware_router.SLOAwareRouterFactory)

pkg/epp/backend/metrics/metrics.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
100100
if p.MetricMapping.TotalRunningRequests != nil {
101101
running, err := p.getMetric(metricFamilies, *p.MetricMapping.TotalRunningRequests)
102102
if err == nil {
103-
updated.RunningQueueSize = int(running.GetGauge().GetValue())
103+
updated.RunningRequestsSize = int(running.GetGauge().GetValue())
104104
} else {
105105
errs = multierr.Append(errs, err)
106106
}

pkg/epp/datalayer/metrics.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ type Metrics struct {
2828
WaitingModels map[string]int
2929
// MaxActiveModels is the maximum number of models that can be loaded to GPU.
3030
MaxActiveModels int
31-
RunningQueueSize int
31+
RunningRequestsSize int
3232
WaitingQueueSize int
3333
KVCacheUsagePercent float64
3434
KvCacheMaxTokenCapacity int
@@ -74,7 +74,7 @@ func (m *Metrics) Clone() *Metrics {
7474
ActiveModels: activeModels,
7575
WaitingModels: waitingModels,
7676
MaxActiveModels: m.MaxActiveModels,
77-
RunningQueueSize: m.RunningQueueSize,
77+
RunningRequestsSize: m.RunningRequestsSize,
7878
WaitingQueueSize: m.WaitingQueueSize,
7979
KVCacheUsagePercent: m.KVCacheUsagePercent,
8080
KvCacheMaxTokenCapacity: m.KvCacheMaxTokenCapacity,

pkg/epp/datalayer/metrics/extractor.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ type Extractor struct {
5656
func Produces() map[string]any {
5757
return map[string]any{
5858
metrics.WaitingQueueSizeKey: int(0),
59+
metrics.RunningRequestsSizeKey: int(0),
5960
metrics.KVCacheUsagePercentKey: float64(0),
6061
metrics.ActiveModelsKey: map[string]int{},
6162
metrics.WaitingModelsKey: map[string]int{},
@@ -119,7 +120,7 @@ func (ext *Extractor) Extract(ctx context.Context, data any, ep datalayer.Endpoi
119120
if metric, err := spec.getLatestMetric(families); err != nil {
120121
errs = append(errs, err)
121122
} else {
122-
clone.RunningQueueSize = int(extractValue(metric))
123+
clone.RunningRequestsSize = int(extractValue(metric))
123124
updated = true
124125
}
125126
}

pkg/epp/datalayer/metrics/logger_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ func TestLogger(t *testing.T) {
7474
assert.Contains(t, logOutput, "Refreshing Prometheus Metrics {\"ReadyPods\": 2}")
7575
assert.Contains(t, logOutput, "Current Pods and metrics gathered {\"Fresh metrics\": \"[Pod: {NamespacedName:default/pod1 PodName: Address:1.2.3.4:5678")
7676
assert.Contains(t, logOutput, "Metrics: {ActiveModels:map[modelA:1] WaitingModels:map[modelB:2] MaxActiveModels:5")
77-
assert.Contains(t, logOutput, "RunningQueueSize:3 WaitingQueueSize:7 KVCacheUsagePercent:42.5 KvCacheMaxTokenCapacity:2048")
77+
assert.Contains(t, logOutput, "RunningRequestsSize:3 WaitingQueueSize:7 KVCacheUsagePercent:42.5 KvCacheMaxTokenCapacity:2048")
7878
assert.Contains(t, logOutput, "Pod: {NamespacedName:default/pod2 PodName: Address:1.2.3.4:5679")
7979
assert.Contains(t, logOutput, "\"Stale metrics\": \"[]\"")
8080
}
@@ -106,7 +106,7 @@ func (f *fakeDataStore) PodList(predicate func(datalayer.Endpoint) bool) []datal
106106
ActiveModels: map[string]int{"modelA": 1},
107107
WaitingModels: map[string]int{"modelB": 2},
108108
MaxActiveModels: 5,
109-
RunningQueueSize: 3,
109+
RunningRequestsSize: 3,
110110
WaitingQueueSize: 7,
111111
KVCacheUsagePercent: 42.5,
112112
KvCacheMaxTokenCapacity: 2048,

pkg/epp/datalayer/metrics_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ func TestMetricsClone(t *testing.T) {
2929
ActiveModels: map[string]int{"modelA": 1},
3030
WaitingModels: map[string]int{"modelB": 2},
3131
MaxActiveModels: 5,
32-
RunningQueueSize: 3,
32+
RunningRequestsSize: 3,
3333
WaitingQueueSize: 7,
3434
KVCacheUsagePercent: 42.5,
3535
KvCacheMaxTokenCapacity: 2048,

pkg/epp/metrics/metrics.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ const (
3737

3838
KVCacheUsagePercentKey = "KVCacheUsagePercent"
3939
WaitingQueueSizeKey = "WaitingQueueSize"
40-
RunningQueueSizeKey = "RunningQueueSize"
40+
RunningRequestsSizeKey = "RunningRequestsSize"
4141
MaxActiveModelsKey = "MaxActiveModels"
4242
ActiveModelsKey = "ActiveModels"
4343
WaitingModelsKey = "WaitingModels"

pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/latencypredictor_helper.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ func processHeaderForLatencyPrediction(
8787
KVCachePercentage: m.KVCacheUsagePercent,
8888
InputTokenLength: len(strings.Fields(sloCtx.schedulingRequest.Body.Completions.Prompt)),
8989
NumRequestWaiting: m.WaitingQueueSize,
90-
NumRequestRunning: m.RunningQueueSize,
90+
NumRequestRunning: m.RunningRequestsSize,
9191
NumTokensGenerated: 0,
9292
PrefixCacheScore: prefix_cache_score,
9393
}
@@ -174,7 +174,7 @@ func recordTTFTTrainingData(
174174
ActualTPOT: 0,
175175
Timestamp: now,
176176
NumRequestWaiting: m.WaitingQueueSize,
177-
NumRequestRunning: m.RunningQueueSize,
177+
NumRequestRunning: m.RunningRequestsSize,
178178
NumTokensGenerated: 0,
179179
PrefixCacheScore: prefixCacheScore,
180180
}
@@ -201,7 +201,7 @@ func predictFirstTPOT(
201201
KVCachePercentage: m.KVCacheUsagePercent,
202202
InputTokenLength: len(strings.Fields(sloCtx.schedulingRequest.Body.Completions.Prompt)),
203203
NumRequestWaiting: m.WaitingQueueSize,
204-
NumRequestRunning: m.RunningQueueSize,
204+
NumRequestRunning: m.RunningRequestsSize,
205205
NumTokensGenerated: sloCtx.generatedTokenCount,
206206
PrefixCacheScore: 0,
207207
}
@@ -260,7 +260,7 @@ func processTokenForLatencyPrediction(
260260
ActualTPOT: latencyMs,
261261
Timestamp: now,
262262
NumRequestWaiting: m.WaitingQueueSize,
263-
NumRequestRunning: m.RunningQueueSize,
263+
NumRequestRunning: m.RunningRequestsSize,
264264
NumTokensGenerated: sloCtx.generatedTokenCount - 1,
265265
PrefixCacheScore: 0, // TPOT does not use prefix cache score
266266
}
@@ -274,7 +274,7 @@ func processTokenForLatencyPrediction(
274274
KVCachePercentage: m.KVCacheUsagePercent,
275275
InputTokenLength: len(strings.Fields(sloCtx.schedulingRequest.Body.Completions.Prompt)),
276276
NumRequestWaiting: m.WaitingQueueSize,
277-
NumRequestRunning: m.RunningQueueSize,
277+
NumRequestRunning: m.RunningRequestsSize,
278278
NumTokensGenerated: sloCtx.generatedTokenCount,
279279
PrefixCacheScore: 0, // TPOT does not use prefix cache score
280280
}
@@ -337,7 +337,7 @@ func bulkPredictWithMetrics(
337337
KVCachePercentage: metricsStates[i].KVCacheUsagePercent,
338338
InputTokenLength: len(strings.Fields(prompts[i])),
339339
NumRequestWaiting: metricsStates[i].WaitingQueueSize,
340-
NumRequestRunning: metricsStates[i].RunningQueueSize,
340+
NumRequestRunning: metricsStates[i].RunningRequestsSize,
341341
NumTokensGenerated: generatedTokenCounts[i],
342342
PrefixCacheScore: prefixCacheScores[i],
343343
}
@@ -385,7 +385,7 @@ func bulkPredictWithMetrics(
385385
"generated_tokens", bulkRequests[i].NumTokensGenerated,
386386
"kv_cache_percent", bulkRequests[i].KVCachePercentage,
387387
"waiting_queue", bulkRequests[i].NumRequestWaiting,
388-
"running_queue", bulkRequests[i].NumRequestRunning,
388+
"running_requests", bulkRequests[i].NumRequestRunning,
389389
"prefix_cache_score", bulkRequests[i].PrefixCacheScore)
390390
}
391391
}

pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/requestcontrol_hooks_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,15 @@ import (
3838
const (
3939
testModelName = "test-model"
4040
kvUsage = 1
41-
runningQueue = 1
41+
runningRequests = 1
4242
waitingQueue = 1
4343
)
4444

4545
// Helper functions
4646

4747
func createTestSchedulingResult(pod *backend.Pod) *schedulingtypes.SchedulingResult {
4848

49-
mockPod := createTestPod(pod.NamespacedName.Name, kvUsage, runningQueue, waitingQueue)
49+
mockPod := createTestPod(pod.NamespacedName.Name, kvUsage, runningRequests, waitingQueue)
5050

5151
return &schedulingtypes.SchedulingResult{
5252
PrimaryProfileName: "default",
@@ -343,12 +343,12 @@ func TestSLOAwareRouter_ResponseStreaming_FirstToken(t *testing.T) {
343343
sloCtx.lastSeenMetrics["prefill"] = &backendmetrics.MetricsState{
344344
KVCacheUsagePercent: 0.5,
345345
WaitingQueueSize: 1,
346-
RunningQueueSize: 1,
346+
RunningRequestsSize: 1,
347347
}
348348
sloCtx.lastSeenMetrics["default"] = &backendmetrics.MetricsState{
349349
KVCacheUsagePercent: 0.5,
350350
WaitingQueueSize: 1,
351-
RunningQueueSize: 1,
351+
RunningRequestsSize: 1,
352352
}
353353
router.setSLOContextForRequest(request, sloCtx)
354354

@@ -394,12 +394,12 @@ func TestSLOAwareRouter_ResponseStreaming_SubsequentTokens(t *testing.T) {
394394
sloCtx.lastSeenMetrics["prefill"] = &backendmetrics.MetricsState{
395395
KVCacheUsagePercent: 0.5,
396396
WaitingQueueSize: 1,
397-
RunningQueueSize: 1,
397+
RunningRequestsSize: 1,
398398
}
399399
sloCtx.lastSeenMetrics["default"] = &backendmetrics.MetricsState{
400400
KVCacheUsagePercent: 0.5,
401401
WaitingQueueSize: 1,
402-
RunningQueueSize: 1,
402+
RunningRequestsSize: 1,
403403
}
404404
firstTokenTime := time.Now().Add(-100 * time.Millisecond)
405405

pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/scorer_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ func (m *mockPredictor) GetServerStatus(ctx context.Context) (*latencypredictor.
102102
return &latencypredictor.ServerStatusResponse{}, nil
103103
}
104104

105-
func createTestPod(name string, kvCacheUsage float64, runningQueueSize, waitingQueueSize int) schedulingtypes.Pod {
105+
func createTestPod(name string, kvCacheUsage float64, runningRequestsSize, waitingQueueSize int) schedulingtypes.Pod {
106106
return &schedulingtypes.PodMetrics{
107107
Pod: &backend.Pod{
108108
NamespacedName: types.NamespacedName{
@@ -112,7 +112,7 @@ func createTestPod(name string, kvCacheUsage float64, runningQueueSize, waitingQ
112112
},
113113
MetricsState: &backendmetrics.MetricsState{
114114
KVCacheUsagePercent: kvCacheUsage,
115-
RunningQueueSize: runningQueueSize,
115+
RunningRequestsSize: runningRequestsSize,
116116
WaitingQueueSize: waitingQueueSize,
117117
},
118118
}

0 commit comments

Comments (0)