Skip to content

Commit eabe076

Browse files
committed
koordlet: revise noderesource and podresource when device collector disabled
Signed-off-by: saintube <saintube@foxmail.com>
1 parent 83a18bf commit eabe076

File tree

4 files changed

+94
-5
lines changed

4 files changed

+94
-5
lines changed

pkg/koordlet/metricsadvisor/collectors/noderesource/node_resource_collector.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,15 @@ func (n *nodeResourceCollector) collectNodeResUsed() {
126126
}
127127
nodeMetrics = append(nodeMetrics, cpuUsageMetrics)
128128

129-
for _, deviceCollector := range n.deviceCollectors {
130-
if metric, _ := deviceCollector.GetNodeMetric(); metric != nil {
129+
for name, deviceCollector := range n.deviceCollectors {
130+
if !deviceCollector.Enabled() {
131+
klog.V(6).Infof("skip node metrics from the disabled device collector %s", name)
132+
continue
133+
}
134+
135+
if metric, err := deviceCollector.GetNodeMetric(); err != nil {
136+
klog.Warningf("get node metrics from the device collector %s failed, err: %s", name, err)
137+
} else {
131138
nodeMetrics = append(nodeMetrics, metric...)
132139
}
133140
if info := deviceCollector.Infos(); info != nil {

pkg/koordlet/metricsadvisor/collectors/noderesource/node_resource_collector_test.go

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,14 +134,15 @@ DirectMap1G: 0 kB`)
134134
CPUTick: 0,
135135
Timestamp: testLastCPUStatTime,
136136
}
137+
testDeviceCollector := &fakeDeviceCollector{isEnabled: true}
137138

138139
c := &nodeResourceCollector{
139140
started: atomic.NewBool(false),
140141
appendableDB: metricCache,
141142
metricDB: metricCache,
142143
lastNodeCPUStat: testLastCPUStat,
143144
deviceCollectors: map[string]framework.DeviceCollector{
144-
"TestDeviceCollector": &fakeDeviceCollector{},
145+
"TestDeviceCollector": testDeviceCollector,
145146
},
146147
sharedState: framework.NewSharedState(),
147148
}
@@ -167,6 +168,24 @@ DirectMap1G: 0 kB`)
167168
assert.Equal(t, wantCPU, nodeCPU.Value)
168169
assert.Equal(t, wantMemory, nodeMemory.Value)
169170

171+
// test collect without device collector
172+
testDeviceCollector.isEnabled = false
173+
testDeviceCollector.getNodeMetric = func() ([]metriccache.MetricSample, error) {
174+
panic("should not be called")
175+
}
176+
assert.NotPanics(t, func() {
177+
c.collectNodeResUsed()
178+
})
179+
assert.True(t, c.Started())
180+
// validate collected values
181+
// assert collected time is less than 10s
182+
got, err = testGetNodeMetrics(t, c.metricDB, testNow, 5*time.Second)
183+
wantCPU = testCPUUsage
184+
assert.Equal(t, wantCPU, float64(got.Cpu().MilliValue()/1000))
185+
// MemTotal - MemAvailable
186+
wantMemory = float64(524288 * 1024)
187+
assert.Equal(t, wantMemory, float64(got.Memory().Value()))
188+
170189
// test first cpu collection
171190
c.lastNodeCPUStat = nil
172191
assert.NotPanics(t, func() {
@@ -185,9 +204,18 @@ DirectMap1G: 0 kB`)
185204

186205
type fakeDeviceCollector struct {
187206
framework.DeviceCollector
207+
isEnabled bool
208+
getNodeMetric func() ([]metriccache.MetricSample, error)
209+
}
210+
211+
func (f *fakeDeviceCollector) Enabled() bool {
212+
return f.isEnabled
188213
}
189214

190215
func (f *fakeDeviceCollector) GetNodeMetric() ([]metriccache.MetricSample, error) {
216+
if f.getNodeMetric != nil {
217+
return f.getNodeMetric()
218+
}
191219
return nil, nil
192220
}
193221

pkg/koordlet/metricsadvisor/collectors/podresource/pod_resource_collector.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,11 @@ func (p *podResourceCollector) collectPodResUsed() {
164164

165165
metrics = append(metrics, cpuUsageMetric, memUsageMetric)
166166
for deviceName, deviceCollector := range p.deviceCollectors {
167+
if !deviceCollector.Enabled() {
168+
klog.V(6).Infof("skip pod metrics from the disabled device collector %s, pod %s", deviceName, podKey)
169+
continue
170+
}
171+
167172
if deviceMetrics, err := deviceCollector.GetPodMetric(uid, meta.CgroupDir, pod.Status.ContainerStatuses); err != nil {
168173
klog.V(4).Infof("get pod %s device usage failed for %v, error: %v", podKey, deviceName, err)
169174
} else if len(metrics) > 0 {
@@ -262,6 +267,11 @@ func (p *podResourceCollector) collectContainerResUsed(meta *statesinformer.PodM
262267
containerMetrics = append(containerMetrics, cpuUsageMetric, memUsageMetric)
263268

264269
for deviceName, deviceCollector := range p.deviceCollectors {
270+
if !deviceCollector.Enabled() {
271+
klog.V(6).Infof("skip container metrics from the disabled device collector %s, container %s", deviceName, containerKey)
272+
continue
273+
}
274+
265275
if metrics, err := deviceCollector.GetContainerMetric(containerStat.ContainerID, meta.CgroupDir, containerStat); err != nil {
266276
klog.Warningf("get container %s device usage failed for %v, error: %v", containerKey, deviceName, err)
267277
} else {

pkg/koordlet/metricsadvisor/collectors/podresource/pod_resource_collector_test.go

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ func Test_collector_collectPodResUsed(t *testing.T) {
7878
}
7979
type fields struct {
8080
podFilterOption framework.PodFilter
81+
deviceCollectors map[string]framework.DeviceCollector
8182
getPodMetas []*statesinformer.PodMeta
8283
initPodLastStat func(lastState *gocache.Cache)
8384
initContainerLastStat func(lastState *gocache.Cache)
@@ -96,6 +97,9 @@ func Test_collector_collectPodResUsed(t *testing.T) {
9697
name: "cgroups v1",
9798
fields: fields{
9899
podFilterOption: framework.DefaultPodFilter,
100+
deviceCollectors: map[string]framework.DeviceCollector{
101+
"TestDeviceCollector": &fakeDeviceCollector{isEnabled: true},
102+
},
99103
getPodMetas: []*statesinformer.PodMeta{
100104
{
101105
CgroupDir: testPodMetaDir,
@@ -150,6 +154,9 @@ total_unevictable 0
150154
name: "cgroups v2",
151155
fields: fields{
152156
podFilterOption: framework.DefaultPodFilter,
157+
deviceCollectors: map[string]framework.DeviceCollector{
158+
"TestDeviceCollector": &fakeDeviceCollector{isEnabled: true},
159+
},
153160
getPodMetas: []*statesinformer.PodMeta{
154161
{
155162
CgroupDir: testPodMetaDir,
@@ -212,9 +219,20 @@ unevictable 0
212219
},
213220
},
214221
{
215-
name: "cgroups v1, filter non-running pods",
222+
name: "cgroups v1, filter non-running pods, skip disabled device collector",
216223
fields: fields{
217224
podFilterOption: &framework.TerminatedPodFilter{},
225+
deviceCollectors: map[string]framework.DeviceCollector{
226+
"TestDeviceCollector": &fakeDeviceCollector{
227+
isEnabled: false,
228+
getPodMetric: func(uid, podParentDir string, cs []corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
229+
panic("should not be called")
230+
},
231+
getContainerMetric: func(uid, podParentDir string, c *corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
232+
panic("should not be called")
233+
},
234+
},
235+
},
218236
getPodMetas: []*statesinformer.PodMeta{
219237
{
220238
CgroupDir: testPodMetaDir,
@@ -304,7 +322,8 @@ total_unevictable 0
304322
},
305323
})
306324
collector.Setup(&framework.Context{
307-
State: framework.NewSharedState(),
325+
State: framework.NewSharedState(),
326+
DeviceCollectors: tt.fields.deviceCollectors,
308327
})
309328
c := collector.(*podResourceCollector)
310329
tt.fields.initPodLastStat(c.lastPodCPUStat)
@@ -350,3 +369,28 @@ func Test_podResourceCollector_Run(t *testing.T) {
350369
close(stopCh)
351370
})
352371
}
372+
373+
type fakeDeviceCollector struct {
374+
framework.DeviceCollector
375+
isEnabled bool
376+
getPodMetric func(uid, podParentDir string, cs []corev1.ContainerStatus) ([]metriccache.MetricSample, error)
377+
getContainerMetric func(uid, podParentDir string, c *corev1.ContainerStatus) ([]metriccache.MetricSample, error)
378+
}
379+
380+
func (f *fakeDeviceCollector) Enabled() bool {
381+
return f.isEnabled
382+
}
383+
384+
func (f *fakeDeviceCollector) GetPodMetric(uid, podParentDir string, cs []corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
385+
if f.getPodMetric != nil {
386+
return f.getPodMetric(uid, podParentDir, cs)
387+
}
388+
return nil, nil
389+
}
390+
391+
func (f *fakeDeviceCollector) GetContainerMetric(containerID, podParentDir string, c *corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
392+
if f.getContainerMetric != nil {
393+
return f.getContainerMetric(containerID, podParentDir, c)
394+
}
395+
return nil, nil
396+
}

0 commit comments

Comments
 (0)