Skip to content

Commit ad36a0b

Browse files
authored
koordlet: revise base collectors and system status check (#1877)
Signed-off-by: saintube <saintube@foxmail.com>
1 parent b547abb commit ad36a0b

File tree

11 files changed

+163
-35
lines changed

11 files changed

+163
-35
lines changed

pkg/koordlet/koordlet.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ func NewDaemon(config *config.Configuration) (Daemon, error) {
7676
klog.Infof("NODE_NAME is %v, start time %v", nodeName, float64(time.Now().Unix()))
7777
metrics.RecordKoordletStartTime(nodeName, float64(time.Now().Unix()))
7878

79+
system.InitSupportConfigs()
7980
klog.Infof("sysconf: %+v, agentMode: %v", system.Conf, system.AgentMode)
8081
klog.Infof("kernel version INFO: %+v", system.HostSystemInfo)
8182

pkg/koordlet/metricsadvisor/collectors/noderesource/node_resource_collector.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,15 @@ func (n *nodeResourceCollector) collectNodeResUsed() {
126126
}
127127
nodeMetrics = append(nodeMetrics, cpuUsageMetrics)
128128

129-
for _, deviceCollector := range n.deviceCollectors {
130-
if metric, _ := deviceCollector.GetNodeMetric(); metric != nil {
129+
for name, deviceCollector := range n.deviceCollectors {
130+
if !deviceCollector.Enabled() {
131+
klog.V(6).Infof("skip node metrics from the disabled device collector %s", name)
132+
continue
133+
}
134+
135+
if metric, err := deviceCollector.GetNodeMetric(); err != nil {
136+
klog.Warningf("get node metrics from the device collector %s failed, err: %s", name, err)
137+
} else {
131138
nodeMetrics = append(nodeMetrics, metric...)
132139
}
133140
if info := deviceCollector.Infos(); info != nil {

pkg/koordlet/metricsadvisor/collectors/noderesource/node_resource_collector_test.go

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,14 +134,15 @@ DirectMap1G: 0 kB`)
134134
CPUTick: 0,
135135
Timestamp: testLastCPUStatTime,
136136
}
137+
testDeviceCollector := &fakeDeviceCollector{isEnabled: true}
137138

138139
c := &nodeResourceCollector{
139140
started: atomic.NewBool(false),
140141
appendableDB: metricCache,
141142
metricDB: metricCache,
142143
lastNodeCPUStat: testLastCPUStat,
143144
deviceCollectors: map[string]framework.DeviceCollector{
144-
"TestDeviceCollector": &fakeDeviceCollector{},
145+
"TestDeviceCollector": testDeviceCollector,
145146
},
146147
sharedState: framework.NewSharedState(),
147148
}
@@ -167,6 +168,24 @@ DirectMap1G: 0 kB`)
167168
assert.Equal(t, wantCPU, nodeCPU.Value)
168169
assert.Equal(t, wantMemory, nodeMemory.Value)
169170

171+
// test collect without device collector
172+
testDeviceCollector.isEnabled = false
173+
testDeviceCollector.getNodeMetric = func() ([]metriccache.MetricSample, error) {
174+
panic("should not be called")
175+
}
176+
assert.NotPanics(t, func() {
177+
c.collectNodeResUsed()
178+
})
179+
assert.True(t, c.Started())
180+
// validate collected values
181+
// assert collected time is less than 10s
182+
got, err = testGetNodeMetrics(t, c.metricDB, testNow, 5*time.Second)
183+
wantCPU = testCPUUsage
184+
assert.Equal(t, wantCPU, float64(got.Cpu().MilliValue()/1000))
185+
// MemTotal - MemAvailable
186+
wantMemory = float64(524288 * 1024)
187+
assert.Equal(t, wantMemory, float64(got.Memory().Value()))
188+
170189
// test first cpu collection
171190
c.lastNodeCPUStat = nil
172191
assert.NotPanics(t, func() {
@@ -185,9 +204,18 @@ DirectMap1G: 0 kB`)
185204

186205
type fakeDeviceCollector struct {
187206
framework.DeviceCollector
207+
isEnabled bool
208+
getNodeMetric func() ([]metriccache.MetricSample, error)
209+
}
210+
211+
func (f *fakeDeviceCollector) Enabled() bool {
212+
return f.isEnabled
188213
}
189214

190215
func (f *fakeDeviceCollector) GetNodeMetric() ([]metriccache.MetricSample, error) {
216+
if f.getNodeMetric != nil {
217+
return f.getNodeMetric()
218+
}
191219
return nil, nil
192220
}
193221

pkg/koordlet/metricsadvisor/collectors/podresource/pod_resource_collector.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,11 @@ func (p *podResourceCollector) collectPodResUsed() {
164164

165165
metrics = append(metrics, cpuUsageMetric, memUsageMetric)
166166
for deviceName, deviceCollector := range p.deviceCollectors {
167+
if !deviceCollector.Enabled() {
168+
klog.V(6).Infof("skip pod metrics from the disabled device collector %s, pod %s", deviceName, podKey)
169+
continue
170+
}
171+
167172
if deviceMetrics, err := deviceCollector.GetPodMetric(uid, meta.CgroupDir, pod.Status.ContainerStatuses); err != nil {
168173
klog.V(4).Infof("get pod %s device usage failed for %v, error: %v", podKey, deviceName, err)
169174
} else if len(metrics) > 0 {
@@ -262,6 +267,11 @@ func (p *podResourceCollector) collectContainerResUsed(meta *statesinformer.PodM
262267
containerMetrics = append(containerMetrics, cpuUsageMetric, memUsageMetric)
263268

264269
for deviceName, deviceCollector := range p.deviceCollectors {
270+
if !deviceCollector.Enabled() {
271+
klog.V(6).Infof("skip container metrics from the disabled device collector %s, container %s", deviceName, containerKey)
272+
continue
273+
}
274+
265275
if metrics, err := deviceCollector.GetContainerMetric(containerStat.ContainerID, meta.CgroupDir, containerStat); err != nil {
266276
klog.Warningf("get container %s device usage failed for %v, error: %v", containerKey, deviceName, err)
267277
} else {

pkg/koordlet/metricsadvisor/collectors/podresource/pod_resource_collector_test.go

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ func Test_collector_collectPodResUsed(t *testing.T) {
7878
}
7979
type fields struct {
8080
podFilterOption framework.PodFilter
81+
deviceCollectors map[string]framework.DeviceCollector
8182
getPodMetas []*statesinformer.PodMeta
8283
initPodLastStat func(lastState *gocache.Cache)
8384
initContainerLastStat func(lastState *gocache.Cache)
@@ -96,6 +97,9 @@ func Test_collector_collectPodResUsed(t *testing.T) {
9697
name: "cgroups v1",
9798
fields: fields{
9899
podFilterOption: framework.DefaultPodFilter,
100+
deviceCollectors: map[string]framework.DeviceCollector{
101+
"TestDeviceCollector": &fakeDeviceCollector{isEnabled: true},
102+
},
99103
getPodMetas: []*statesinformer.PodMeta{
100104
{
101105
CgroupDir: testPodMetaDir,
@@ -150,6 +154,9 @@ total_unevictable 0
150154
name: "cgroups v2",
151155
fields: fields{
152156
podFilterOption: framework.DefaultPodFilter,
157+
deviceCollectors: map[string]framework.DeviceCollector{
158+
"TestDeviceCollector": &fakeDeviceCollector{isEnabled: true},
159+
},
153160
getPodMetas: []*statesinformer.PodMeta{
154161
{
155162
CgroupDir: testPodMetaDir,
@@ -212,9 +219,20 @@ unevictable 0
212219
},
213220
},
214221
{
215-
name: "cgroups v1, filter non-running pods",
222+
name: "cgroups v1, filter non-running pods, skip disabled device collector",
216223
fields: fields{
217224
podFilterOption: &framework.TerminatedPodFilter{},
225+
deviceCollectors: map[string]framework.DeviceCollector{
226+
"TestDeviceCollector": &fakeDeviceCollector{
227+
isEnabled: false,
228+
getPodMetric: func(uid, podParentDir string, cs []corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
229+
panic("should not be called")
230+
},
231+
getContainerMetric: func(uid, podParentDir string, c *corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
232+
panic("should not be called")
233+
},
234+
},
235+
},
218236
getPodMetas: []*statesinformer.PodMeta{
219237
{
220238
CgroupDir: testPodMetaDir,
@@ -304,7 +322,8 @@ total_unevictable 0
304322
},
305323
})
306324
collector.Setup(&framework.Context{
307-
State: framework.NewSharedState(),
325+
State: framework.NewSharedState(),
326+
DeviceCollectors: tt.fields.deviceCollectors,
308327
})
309328
c := collector.(*podResourceCollector)
310329
tt.fields.initPodLastStat(c.lastPodCPUStat)
@@ -350,3 +369,28 @@ func Test_podResourceCollector_Run(t *testing.T) {
350369
close(stopCh)
351370
})
352371
}
372+
373+
type fakeDeviceCollector struct {
374+
framework.DeviceCollector
375+
isEnabled bool
376+
getPodMetric func(uid, podParentDir string, cs []corev1.ContainerStatus) ([]metriccache.MetricSample, error)
377+
getContainerMetric func(uid, podParentDir string, c *corev1.ContainerStatus) ([]metriccache.MetricSample, error)
378+
}
379+
380+
func (f *fakeDeviceCollector) Enabled() bool {
381+
return f.isEnabled
382+
}
383+
384+
func (f *fakeDeviceCollector) GetPodMetric(uid, podParentDir string, cs []corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
385+
if f.getPodMetric != nil {
386+
return f.getPodMetric(uid, podParentDir, cs)
387+
}
388+
return nil, nil
389+
}
390+
391+
func (f *fakeDeviceCollector) GetContainerMetric(containerID, podParentDir string, c *corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
392+
if f.getContainerMetric != nil {
393+
return f.getContainerMetric(containerID, podParentDir, c)
394+
}
395+
return nil, nil
396+
}

pkg/koordlet/runtimehooks/runtimehooks.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ func NewRuntimeHook(si statesinformer.StatesInformer, cfg *Config) (RuntimeHook,
111111
}
112112
nriServer, err = nri.NewNriServer(nriServerOptions)
113113
if err != nil {
114-
klog.Errorf("new nri mode runtimehooks server error: %v", err)
114+
klog.Warningf("new nri mode runtimehooks server error: %v", err)
115115
}
116116
} else {
117117
klog.V(4).Info("nri mode runtimehooks is disabled")

pkg/koordlet/util/system/config.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"os"
2222

2323
"go.uber.org/atomic"
24+
"k8s.io/klog/v2"
2425
)
2526

2627
const (
@@ -57,10 +58,20 @@ func init() {
5758
}
5859
}
5960

60-
func initSupportConfigs() {
61+
// InitSupportConfigs initializes the system support status.
62+
// e.g. the cgroup version, resctrl capability
63+
func InitSupportConfigs() {
64+
// $ getconf CLK_TCK > jiffies
65+
if err := initJiffies(); err != nil {
66+
klog.Warningf("failed to get Jiffies, use the default %v, err: %v", Jiffies, err)
67+
}
6168
initCgroupsVersion()
6269
HostSystemInfo = collectVersionInfo()
63-
_, _ = IsSupportResctrl()
70+
if isResctrlSupported, err := IsSupportResctrl(); err != nil {
71+
klog.Warningf("failed to check resctrl support status, use %d, err: %v", isResctrlSupported, err)
72+
} else {
73+
klog.V(4).Infof("resctrl supported: %v", isResctrlSupported)
74+
}
6475
}
6576

6677
func NewHostModeConfig() *Config {
@@ -110,6 +121,4 @@ func (c *Config) InitFlags(fs *flag.FlagSet) {
110121
fs.StringVar(&c.PouchEndpoint, "pouch-endpoint", c.PouchEndpoint, "pouch endPoint")
111122

112123
fs.StringVar(&c.DefaultRuntimeType, "default-runtime-type", c.DefaultRuntimeType, "default runtime type during runtime hooks handle request, candidates are containerd/docker/pouch.")
113-
114-
initSupportConfigs()
115124
}

pkg/koordlet/util/system/config_test.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,54 @@ func Test_InitFlags(t *testing.T) {
6060
assert.NotNil(t, cfg)
6161
})
6262
}
63+
64+
func Test_InitSupportConfigs(t *testing.T) {
65+
type fields struct {
66+
prepareFn func(helper *FileTestUtil)
67+
}
68+
type expects struct {
69+
hostSystemInfo VersionInfo
70+
}
71+
tests := []struct {
72+
name string
73+
fields fields
74+
expects expects
75+
}{
76+
{
77+
name: "not anolis os, not support resctrl",
78+
fields: fields{
79+
prepareFn: func(helper *FileTestUtil) {},
80+
},
81+
expects: expects{
82+
hostSystemInfo: VersionInfo{
83+
IsAnolisOS: false,
84+
},
85+
},
86+
},
87+
{
88+
name: "anolis os, not support resctrl",
89+
fields: fields{
90+
prepareFn: func(helper *FileTestUtil) {
91+
bvtResource, _ := GetCgroupResource(CPUBVTWarpNsName)
92+
helper.WriteCgroupFileContents("", bvtResource, "0")
93+
},
94+
},
95+
expects: expects{
96+
hostSystemInfo: VersionInfo{
97+
IsAnolisOS: true,
98+
},
99+
},
100+
},
101+
}
102+
for _, tt := range tests {
103+
t.Run(tt.name, func(t *testing.T) {
104+
helper := NewFileTestUtil(t)
105+
defer helper.Cleanup()
106+
if tt.fields.prepareFn != nil {
107+
tt.fields.prepareFn(helper)
108+
}
109+
InitSupportConfigs()
110+
assert.Equal(t, tt.expects.hostSystemInfo, HostSystemInfo)
111+
})
112+
}
113+
}

pkg/koordlet/util/system/resctrl.go

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ func IsSupportResctrl() (bool, error) {
103103
return false, err
104104
}
105105
// Kernel cmdline not set resctrl features does not ensure feature must be disabled.
106-
klog.V(4).Infof("isResctrlAvailableByKernelCmd result, cpuSupport: %v, kernelSupport: %v",
106+
klog.Infof("IsSupportResctrl result, cpuSupport: %v, kernelSupport: %v",
107107
cpuSupport, kernelCmdSupport)
108108
isSupportResctrl = cpuSupport
109109
isInit = true
@@ -560,21 +560,6 @@ func CheckAndTryEnableResctrlCat() error {
560560
return nil
561561
}
562562

563-
func InitCatGroupIfNotExist(group string) error {
564-
path := GetResctrlGroupRootDirPath(group)
565-
_, err := os.Stat(path)
566-
if err == nil {
567-
return nil
568-
} else if !os.IsNotExist(err) {
569-
return fmt.Errorf("check dir %v for group %s but got unexpected err: %v", path, group, err)
570-
}
571-
err = os.Mkdir(path, 0755)
572-
if err != nil {
573-
return fmt.Errorf("create dir %v failed for group %s, err: %v", path, group, err)
574-
}
575-
return nil
576-
}
577-
578563
func CheckResctrlSchemataValid() error {
579564
schemataPath := GetResctrlSchemataFilePath("")
580565
schemataRaw, err := ReadResctrlSchemataRaw(schemataPath, -1)

pkg/koordlet/util/system/system_file.go

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,19 +51,12 @@ var (
5151
Jiffies = float64(10 * time.Millisecond)
5252
)
5353

54-
func init() {
55-
// $ getconf CLK_TCK > jiffies
56-
if err := initJiffies(); err != nil {
57-
klog.Warningf("failed to get Jiffies, use the default %v, err: %v", Jiffies, err)
58-
}
59-
}
60-
6154
// initJiffies use command "getconf CLK_TCK" to fetch the clock tick on current host,
6255
// if the command doesn't exist, uses the default value 10ms for jiffies
6356
func initJiffies() error {
6457
getconf, err := exec.LookPath("getconf")
6558
if err != nil {
66-
return nil
59+
return err
6760
}
6861
cmd := exec.Command(getconf, "CLK_TCK")
6962
var out bytes.Buffer

0 commit comments

Comments
 (0)