Skip to content

Commit 08506ae

Browse files
saintube authored and lucming committed
koordlet: add metrics for batch resources (koordinator-sh#913)
Signed-off-by: saintube <saintube@foxmail.com>
1 parent 731a99d commit 08506ae

18 files changed

+593
-68
lines changed

pkg/koordlet/koordlet.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ func (d *daemon) Run(stopCh <-chan struct{}) {
199199

200200
go func() {
201201
if err := d.runtimeHook.Run(stopCh); err != nil {
202-
klog.Errorf("Unable to run the runtimeHook: ", err)
202+
klog.Error("Unable to run the runtimeHook: ", err)
203203
os.Exit(1)
204204
}
205205
}()

pkg/koordlet/metrics/common.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ var (
2626
Name: "start_time",
2727
Help: "the start time of koordlet",
2828
}, []string{NodeKey})
29+
2930
CollectNodeCPUInfoStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
3031
Subsystem: KoordletSubsystem,
3132
Name: "collect_node_cpu_info_status",

pkg/koordlet/metrics/metrics.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626

2727
func init() {
2828
prometheus.MustRegister(CommonCollectors...)
29+
prometheus.MustRegister(ResourceSummaryCollectors...)
2930
prometheus.MustRegister(CPICollectors...)
3031
prometheus.MustRegister(PSICollectors...)
3132
prometheus.MustRegister(CPUSuppressCollector...)
@@ -38,7 +39,7 @@ const (
3839
NodeKey = "node"
3940

4041
StatusKey = "status"
41-
StatusSucceed = "succeed"
42+
StatusSucceed = "succeeded"
4243
StatusFailed = "failed"
4344

4445
EvictionReasonKey = "reason"
@@ -50,6 +51,8 @@ const (
5051
PodUID = "pod_uid"
5152
PodName = "pod_name"
5253
PodNamespace = "pod_namespace"
54+
55+
ResourceKey = "resource"
5356
)
5457

5558
var (

pkg/koordlet/metrics/metrics_test.go

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@ import (
2626
"k8s.io/apimachinery/pkg/api/resource"
2727
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2828

29+
apiext "github.com/koordinator-sh/koordinator/apis/extension"
2930
koordletutil "github.com/koordinator-sh/koordinator/pkg/koordlet/util"
3031
"github.com/koordinator-sh/koordinator/pkg/koordlet/util/system"
32+
"github.com/koordinator-sh/koordinator/pkg/util"
3133
)
3234

3335
func TestGenNodeLabels(t *testing.T) {
@@ -131,8 +133,8 @@ func TestCommonCollectors(t *testing.T) {
131133
RecordCollectNodeCPUInfoStatus(testingErr)
132134
RecordCollectNodeCPUInfoStatus(nil)
133135
RecordBESuppressCores("cfsQuota", float64(1000))
134-
RecordBESuppressLSUsedCPU(float64(1000))
135-
RecordNodeUsedCPU(float64(2000))
136+
RecordBESuppressLSUsedCPU(1.0)
137+
RecordNodeUsedCPU(2.0)
136138
RecordContainerScaledCFSBurstUS(testingPod.Namespace, testingPod.Name, testingContainer.ContainerID, testingContainer.Name, 1000000)
137139
RecordContainerScaledCFSQuotaUS(testingPod.Namespace, testingPod.Name, testingContainer.ContainerID, testingContainer.Name, 1000000)
138140
RecordPodEviction("evictByCPU")
@@ -144,3 +146,107 @@ func TestCommonCollectors(t *testing.T) {
144146
RecordPodPSI(testingPod, testingPSI)
145147
})
146148
}
149+
150+
func TestResourceSummaryCollectors(t *testing.T) {
151+
testingNode := &corev1.Node{
152+
ObjectMeta: metav1.ObjectMeta{
153+
Name: "test-node",
154+
Labels: map[string]string{},
155+
},
156+
Status: corev1.NodeStatus{
157+
Allocatable: corev1.ResourceList{
158+
corev1.ResourceCPU: resource.MustParse("100"),
159+
corev1.ResourceMemory: resource.MustParse("200Gi"),
160+
apiext.BatchCPU: resource.MustParse("50000"),
161+
apiext.BatchMemory: resource.MustParse("80Gi"),
162+
},
163+
Capacity: corev1.ResourceList{
164+
corev1.ResourceCPU: resource.MustParse("100"),
165+
corev1.ResourceMemory: resource.MustParse("200Gi"),
166+
apiext.BatchCPU: resource.MustParse("50000"),
167+
apiext.BatchMemory: resource.MustParse("80Gi"),
168+
},
169+
},
170+
}
171+
testingPod := &corev1.Pod{
172+
ObjectMeta: metav1.ObjectMeta{
173+
Name: "test_pod",
174+
Namespace: "test_pod_namespace",
175+
UID: "test01",
176+
},
177+
Spec: corev1.PodSpec{
178+
Containers: []corev1.Container{
179+
{
180+
Name: "test_container",
181+
Resources: corev1.ResourceRequirements{
182+
Requests: corev1.ResourceList{
183+
corev1.ResourceCPU: resource.MustParse("1"),
184+
corev1.ResourceMemory: resource.MustParse("2Gi"),
185+
},
186+
Limits: corev1.ResourceList{
187+
corev1.ResourceCPU: resource.MustParse("2"),
188+
corev1.ResourceMemory: resource.MustParse("4Gi"),
189+
},
190+
},
191+
},
192+
},
193+
},
194+
Status: corev1.PodStatus{
195+
ContainerStatuses: []corev1.ContainerStatus{
196+
{
197+
Name: "test_container",
198+
ContainerID: "containerd://testxxx",
199+
},
200+
},
201+
},
202+
}
203+
testingBatchPod := &corev1.Pod{
204+
ObjectMeta: metav1.ObjectMeta{
205+
Name: "test_batch_pod",
206+
Namespace: "test_batch_pod_namespace",
207+
UID: "batch01",
208+
},
209+
Spec: corev1.PodSpec{
210+
Containers: []corev1.Container{
211+
{
212+
Name: "test_batch_container",
213+
Resources: corev1.ResourceRequirements{
214+
Requests: corev1.ResourceList{
215+
apiext.BatchCPU: resource.MustParse("1000"),
216+
apiext.BatchMemory: resource.MustParse("2Gi"),
217+
},
218+
Limits: corev1.ResourceList{
219+
apiext.BatchCPU: resource.MustParse("1000"),
220+
apiext.BatchMemory: resource.MustParse("2Gi"),
221+
},
222+
},
223+
},
224+
},
225+
},
226+
Status: corev1.PodStatus{
227+
ContainerStatuses: []corev1.ContainerStatus{
228+
{
229+
Name: "test_batch_container",
230+
ContainerID: "containerd://batchxxx",
231+
},
232+
},
233+
},
234+
}
235+
236+
t.Run("test not panic", func(t *testing.T) {
237+
Register(testingNode)
238+
defer Register(nil)
239+
240+
RecordNodeResourceAllocatable(string(apiext.BatchCPU), float64(util.QuantityPtr(testingNode.Status.Allocatable[apiext.BatchCPU]).Value()))
241+
RecordNodeResourceAllocatable(string(apiext.BatchMemory), float64(util.QuantityPtr(testingNode.Status.Allocatable[apiext.BatchMemory]).Value()))
242+
RecordContainerResourceRequests(string(corev1.ResourceCPU), &testingPod.Status.ContainerStatuses[0], testingPod, float64(testingPod.Spec.Containers[0].Resources.Requests.Cpu().Value()))
243+
RecordContainerResourceRequests(string(corev1.ResourceMemory), &testingPod.Status.ContainerStatuses[0], testingPod, float64(testingPod.Spec.Containers[0].Resources.Requests.Memory().Value()))
244+
RecordContainerResourceRequests(string(apiext.BatchCPU), &testingBatchPod.Status.ContainerStatuses[0], testingBatchPod, float64(util.QuantityPtr(testingBatchPod.Spec.Containers[0].Resources.Requests[apiext.BatchCPU]).Value()))
245+
RecordContainerResourceRequests(string(apiext.BatchMemory), &testingBatchPod.Status.ContainerStatuses[0], testingBatchPod, float64(util.QuantityPtr(testingBatchPod.Spec.Containers[0].Resources.Requests[apiext.BatchMemory]).Value()))
246+
RecordContainerResourceLimits(string(apiext.BatchCPU), &testingBatchPod.Status.ContainerStatuses[0], testingBatchPod, float64(util.QuantityPtr(testingBatchPod.Spec.Containers[0].Resources.Limits[apiext.BatchCPU]).Value()))
247+
RecordContainerResourceLimits(string(apiext.BatchMemory), &testingBatchPod.Status.ContainerStatuses[0], testingBatchPod, float64(util.QuantityPtr(testingBatchPod.Spec.Containers[0].Resources.Limits[apiext.BatchMemory]).Value()))
248+
249+
ResetContainerResourceRequests()
250+
ResetContainerResourceLimits()
251+
})
252+
}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/*
2+
Copyright 2022 The Koordinator Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metrics
18+
19+
import (
20+
corev1 "k8s.io/api/core/v1"
21+
22+
"github.com/prometheus/client_golang/prometheus"
23+
)
24+
25+
var (
26+
NodeResourceAllocatable = prometheus.NewGaugeVec(prometheus.GaugeOpts{
27+
Subsystem: KoordletSubsystem,
28+
Name: "node_resource_allocatable",
29+
Help: "the node allocatable of resources updated by koordinator",
30+
}, []string{NodeKey, ResourceKey})
31+
32+
ContainerResourceRequests = prometheus.NewGaugeVec(prometheus.GaugeOpts{
33+
Subsystem: KoordletSubsystem,
34+
Name: "container_resource_requests",
35+
Help: "the container requests of resources updated by koordinator",
36+
}, []string{NodeKey, ResourceKey, PodUID, PodName, PodNamespace, ContainerID, ContainerName})
37+
38+
ContainerResourceLimits = prometheus.NewGaugeVec(prometheus.GaugeOpts{
39+
Subsystem: KoordletSubsystem,
40+
Name: "container_resource_limits",
41+
Help: "the container limits of resources updated by koordinator",
42+
}, []string{NodeKey, ResourceKey, PodUID, PodName, PodNamespace, ContainerID, ContainerName})
43+
44+
ResourceSummaryCollectors = []prometheus.Collector{
45+
NodeResourceAllocatable,
46+
ContainerResourceRequests,
47+
ContainerResourceLimits,
48+
}
49+
)
50+
51+
func RecordNodeResourceAllocatable(resourceName string, value float64) {
52+
labels := genNodeLabels()
53+
if labels == nil {
54+
return
55+
}
56+
labels[ResourceKey] = resourceName
57+
NodeResourceAllocatable.With(labels).Set(value)
58+
}
59+
60+
func RecordContainerResourceRequests(resourceName string, status *corev1.ContainerStatus, pod *corev1.Pod, value float64) {
61+
labels := genNodeLabels()
62+
if labels == nil {
63+
return
64+
}
65+
labels[ResourceKey] = resourceName
66+
labels[PodUID] = string(pod.UID)
67+
labels[PodName] = pod.Name
68+
labels[PodNamespace] = pod.Namespace
69+
labels[ContainerID] = status.ContainerID
70+
labels[ContainerName] = status.Name
71+
ContainerResourceRequests.With(labels).Set(value)
72+
}
73+
74+
func ResetContainerResourceRequests() {
75+
ContainerResourceRequests.Reset()
76+
}
77+
78+
func RecordContainerResourceLimits(resourceName string, status *corev1.ContainerStatus, pod *corev1.Pod, value float64) {
79+
labels := genNodeLabels()
80+
if labels == nil {
81+
return
82+
}
83+
labels[ResourceKey] = resourceName
84+
labels[PodUID] = string(pod.UID)
85+
labels[PodName] = pod.Name
86+
labels[PodNamespace] = pod.Namespace
87+
labels[ContainerID] = status.ContainerID
88+
labels[ContainerName] = status.Name
89+
ContainerResourceLimits.With(labels).Set(value)
90+
}
91+
92+
func ResetContainerResourceLimits() {
93+
ContainerResourceLimits.Reset()
94+
}

pkg/koordlet/metricsadvisor/collector.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -125,11 +125,11 @@ func (c *collector) Run(stopCh <-chan struct{}) error {
125125
go wait.Until(func() {
126126
c.collectGPUUsage()
127127
c.collectNodeResUsed()
128-
// add sync metaService cache check before collect pod information
128+
// add sync statesInformer cache check before collect pod information
129129
// because collect function will get all pods.
130130
if !cache.WaitForCacheSync(stopCh, c.statesInformer.HasSynced) {
131-
klog.Errorf("timed out waiting for meta service caches to sync")
132-
// Koordlet exit because of metaService sync failed.
131+
klog.Errorf("timed out waiting for states informer caches to sync")
132+
// Koordlet exit because of statesInformer sync failed.
133133
os.Exit(1)
134134
return
135135
}
@@ -142,11 +142,11 @@ func (c *collector) Run(stopCh <-chan struct{}) error {
142142

143143
ic := NewPerformanceCollector(c.statesInformer, c.metricCache, c.config.CPICollectorTimeWindowSeconds)
144144
util.RunFeature(func() {
145-
// add sync metaService cache check before collect pod information
145+
// add sync statesInformer cache check before collect pod information
146146
// because collect function will get all pods.
147147
if !cache.WaitForCacheSync(stopCh, c.statesInformer.HasSynced) {
148-
// Koordlet exit because of metaService sync failed.
149-
klog.Fatalf("timed out waiting for meta service caches to sync")
148+
// Koordlet exit because of statesInformer sync failed.
149+
klog.Fatalf("timed out waiting for states informer caches to sync")
150150
return
151151
}
152152
ic.collectContainerCPI()
@@ -158,11 +158,11 @@ func (c *collector) Run(stopCh <-chan struct{}) error {
158158
klog.Fatalf("collect psi fail, need anolis os")
159159
return
160160
}
161-
// add sync metaService cache check before collect pod information
161+
// add sync statesInformer cache check before collect pod information
162162
// because collect function will get all pods.
163163
if !cache.WaitForCacheSync(stopCh, c.statesInformer.HasSynced) {
164-
// Koordlet exit because of metaService sync failed.
165-
klog.Fatalf("timed out waiting for meta service caches to sync")
164+
// Koordlet exit because of statesInformer sync failed.
165+
klog.Fatalf("timed out waiting for states informer caches to sync")
166166
return
167167
}
168168
ic.collectContainerPSI()
@@ -217,7 +217,7 @@ func (c *collector) collectNodeResUsed() {
217217

218218
// update collect time
219219
c.state.RefreshTime(nodeResUsedUpdateTime)
220-
metrics.RecordNodeUsedCPU(cpuUsageValue * 1000)
220+
metrics.RecordNodeUsedCPU(cpuUsageValue) // in cpu cores
221221

222222
klog.Infof("collectNodeResUsed finished %+v", nodeMetric)
223223
}

pkg/koordlet/metricsadvisor/collector_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ import (
3535

3636
func TestNewCollector(t *testing.T) {
3737
type args struct {
38-
cfg *Config
39-
metaService statesinformer.StatesInformer
40-
metricCache metriccache.MetricCache
38+
cfg *Config
39+
statesInformer statesinformer.StatesInformer
40+
metricCache metriccache.MetricCache
4141
}
4242
tests := []struct {
4343
name string
@@ -46,15 +46,15 @@ func TestNewCollector(t *testing.T) {
4646
{
4747
name: "new-collector",
4848
args: args{
49-
cfg: &Config{},
50-
metaService: nil,
51-
metricCache: nil,
49+
cfg: &Config{},
50+
statesInformer: nil,
51+
metricCache: nil,
5252
},
5353
},
5454
}
5555
for _, tt := range tests {
5656
t.Run(tt.name, func(t *testing.T) {
57-
if got := NewCollector(tt.args.cfg, tt.args.metaService, tt.args.metricCache); got == nil {
57+
if got := NewCollector(tt.args.cfg, tt.args.statesInformer, tt.args.metricCache); got == nil {
5858
t.Errorf("NewCollector() = %v", got)
5959
}
6060
})

pkg/koordlet/metricsadvisor/performance_collector_linux_test.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ import (
3737

3838
func TestNewPerformanceCollector(t *testing.T) {
3939
type args struct {
40-
cfg *Config
41-
metaService statesinformer.StatesInformer
42-
metricCache metriccache.MetricCache
43-
timeWindow int
40+
cfg *Config
41+
statesInformer statesinformer.StatesInformer
42+
metricCache metriccache.MetricCache
43+
timeWindow int
4444
}
4545
tests := []struct {
4646
name string
@@ -49,16 +49,16 @@ func TestNewPerformanceCollector(t *testing.T) {
4949
{
5050
name: "new-performance-collector",
5151
args: args{
52-
cfg: &Config{},
53-
metaService: nil,
54-
metricCache: nil,
55-
timeWindow: 10,
52+
cfg: &Config{},
53+
statesInformer: nil,
54+
metricCache: nil,
55+
timeWindow: 10,
5656
},
5757
},
5858
}
5959
for _, tt := range tests {
6060
t.Run(tt.name, func(t *testing.T) {
61-
if got := NewPerformanceCollector(tt.args.metaService, tt.args.metricCache, tt.args.timeWindow); got == nil {
61+
if got := NewPerformanceCollector(tt.args.statesInformer, tt.args.metricCache, tt.args.timeWindow); got == nil {
6262
t.Errorf("NewPerformanceCollector() = %v", got)
6363
}
6464
})

0 commit comments

Comments (0)