Skip to content

Commit 1bac4d3

Browse files
authored
koord-scheduler: add a parameter to mark whether scheduling is allowed on node with expired nodemetric (#2076)
Signed-off-by: lucming <2876757716@qq.com>
1 parent c159ea2 commit 1bac4d3

File tree

8 files changed

+183
-1
lines changed

8 files changed

+183
-1
lines changed

pkg/scheduler/apis/config/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ type LoadAwareSchedulingArgs struct {
3737
// When NodeMetrics expired, the node is considered abnormal.
3838
// Default is 180 seconds.
3939
NodeMetricExpirationSeconds *int64
40+
// EnableScheduleWhenNodeMetricsExpired indicates whether pods are allowed to be scheduled on nodes whose NodeMetrics have expired.
41+
EnableScheduleWhenNodeMetricsExpired *bool
4042
// ResourceWeights indicates the weights of resources.
4143
// The weights of CPU and Memory are both 1 by default.
4244
ResourceWeights map[corev1.ResourceName]int64

pkg/scheduler/apis/config/v1beta3/defaults.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ func SetDefaults_LoadAwareSchedulingArgs(obj *LoadAwareSchedulingArgs) {
7878
if obj.FilterExpiredNodeMetrics == nil {
7979
obj.FilterExpiredNodeMetrics = pointer.Bool(true)
8080
}
81+
if obj.EnableScheduleWhenNodeMetricsExpired == nil {
82+
obj.EnableScheduleWhenNodeMetricsExpired = pointer.Bool(false)
83+
}
8184
if obj.NodeMetricExpirationSeconds == nil {
8285
obj.NodeMetricExpirationSeconds = pointer.Int64(defaultNodeMetricExpirationSeconds)
8386
}

pkg/scheduler/apis/config/v1beta3/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ type LoadAwareSchedulingArgs struct {
3636
// When NodeMetrics expired, the node is considered abnormal.
3737
// Default is 180 seconds.
3838
NodeMetricExpirationSeconds *int64 `json:"nodeMetricExpirationSeconds,omitempty"`
39+
// EnableScheduleWhenNodeMetricsExpired indicates whether pods are allowed to be scheduled on nodes whose NodeMetrics have expired. Default is false.
40+
EnableScheduleWhenNodeMetricsExpired *bool `json:"enableScheduleWhenNodeMetricsExpired,omitempty"`
3941
// ResourceWeights indicates the weights of resources.
4042
// The weights of CPU and Memory are both 1 by default.
4143
ResourceWeights map[corev1.ResourceName]int64 `json:"resourceWeights,omitempty"`

pkg/scheduler/apis/config/v1beta3/zz_generated.conversion.go

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/scheduler/apis/config/v1beta3/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/scheduler/apis/config/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/scheduler/plugins/loadaware/load_aware.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ import (
4242

4343
const (
4444
Name = "LoadAwareScheduling"
45+
ErrReasonNodeMetricExpired = "node(s) nodeMetric expired"
4546
ErrReasonUsageExceedThreshold = "node(s) %s usage exceed threshold"
4647
ErrReasonAggregatedUsageExceedThreshold = "node(s) %s aggregated usage exceed threshold"
4748
ErrReasonFailedEstimatePod
@@ -143,6 +144,9 @@ func (p *Plugin) Filter(ctx context.Context, state *framework.CycleState, pod *c
143144

144145
if p.args.FilterExpiredNodeMetrics != nil && *p.args.FilterExpiredNodeMetrics &&
145146
p.args.NodeMetricExpirationSeconds != nil && isNodeMetricExpired(nodeMetric, *p.args.NodeMetricExpirationSeconds) {
147+
if p.args.EnableScheduleWhenNodeMetricsExpired != nil && !*p.args.EnableScheduleWhenNodeMetricsExpired {
148+
return framework.NewStatus(framework.Unschedulable, ErrReasonNodeMetricExpired)
149+
}
146150
return nil
147151
}
148152

pkg/scheduler/plugins/loadaware/load_aware_test.go

Lines changed: 160 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ func TestFilterExpiredNodeMetric(t *testing.T) {
183183
},
184184
},
185185
},
186-
wantStatus: nil,
186+
wantStatus: framework.NewStatus(framework.Unschedulable, ErrReasonNodeMetricExpired),
187187
},
188188
{
189189
name: "filter unhealthy nodeMetric with expired updateTime",
@@ -202,13 +202,172 @@ func TestFilterExpiredNodeMetric(t *testing.T) {
202202
},
203203
},
204204
},
205+
wantStatus: framework.NewStatus(framework.Unschedulable, ErrReasonNodeMetricExpired),
206+
},
207+
}
208+
for _, tt := range tests {
209+
t.Run(tt.name, func(t *testing.T) {
210+
var v1beta3args v1beta3.LoadAwareSchedulingArgs
211+
v1beta3.SetDefaults_LoadAwareSchedulingArgs(&v1beta3args)
212+
var loadAwareSchedulingArgs config.LoadAwareSchedulingArgs
213+
err := v1beta3.Convert_v1beta3_LoadAwareSchedulingArgs_To_config_LoadAwareSchedulingArgs(&v1beta3args, &loadAwareSchedulingArgs, nil)
214+
assert.NoError(t, err)
215+
216+
koordClientSet := koordfake.NewSimpleClientset()
217+
koordSharedInformerFactory := koordinatorinformers.NewSharedInformerFactory(koordClientSet, 0)
218+
extenderFactory, _ := frameworkext.NewFrameworkExtenderFactory(
219+
frameworkext.WithKoordinatorClientSet(koordClientSet),
220+
frameworkext.WithKoordinatorSharedInformerFactory(koordSharedInformerFactory),
221+
)
222+
proxyNew := frameworkext.PluginFactoryProxy(extenderFactory, New)
223+
224+
cs := kubefake.NewSimpleClientset()
225+
informerFactory := informers.NewSharedInformerFactory(cs, 0)
226+
227+
nodes := []*corev1.Node{
228+
{
229+
ObjectMeta: metav1.ObjectMeta{
230+
Name: tt.nodeMetric.Name,
231+
},
232+
},
233+
}
234+
235+
snapshot := newTestSharedLister(nil, nodes)
236+
registeredPlugins := []schedulertesting.RegisterPluginFunc{
237+
schedulertesting.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New),
238+
schedulertesting.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),
239+
}
240+
fh, err := schedulertesting.NewFramework(context.TODO(), registeredPlugins, "koord-scheduler",
241+
frameworkruntime.WithClientSet(cs),
242+
frameworkruntime.WithInformerFactory(informerFactory),
243+
frameworkruntime.WithSnapshotSharedLister(snapshot),
244+
)
245+
assert.Nil(t, err)
246+
247+
p, err := proxyNew(&loadAwareSchedulingArgs, fh)
248+
assert.NotNil(t, p)
249+
assert.Nil(t, err)
250+
251+
_, err = koordClientSet.SloV1alpha1().NodeMetrics().Create(context.TODO(), tt.nodeMetric, metav1.CreateOptions{})
252+
assert.NoError(t, err)
253+
254+
koordSharedInformerFactory.Start(context.TODO().Done())
255+
koordSharedInformerFactory.WaitForCacheSync(context.TODO().Done())
256+
257+
cycleState := framework.NewCycleState()
258+
259+
nodeInfo, err := snapshot.Get(tt.nodeMetric.Name)
260+
assert.NoError(t, err)
261+
assert.NotNil(t, nodeInfo)
262+
263+
status := p.(*Plugin).Filter(context.TODO(), cycleState, &corev1.Pod{}, nodeInfo)
264+
assert.True(t, tt.wantStatus.Equal(status), "want status: %s, but got %s", tt.wantStatus.Message(), status.Message())
265+
})
266+
}
267+
}
268+
269+
func TestEnableScheduleWhenNodeMetricsExpired(t *testing.T) {
270+
tests := []struct {
271+
name string
272+
nodeMetric *slov1alpha1.NodeMetric
273+
enableScheduleWhenNodeMetricsExpired *bool
274+
wantStatus *framework.Status
275+
}{
276+
{
277+
name: "filter healthy nodeMetrics",
278+
nodeMetric: &slov1alpha1.NodeMetric{
279+
ObjectMeta: metav1.ObjectMeta{
280+
Name: "test-node-1",
281+
},
282+
Spec: slov1alpha1.NodeMetricSpec{
283+
CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{
284+
ReportIntervalSeconds: pointer.Int64(60),
285+
},
286+
},
287+
Status: slov1alpha1.NodeMetricStatus{
288+
UpdateTime: &metav1.Time{
289+
Time: time.Now(),
290+
},
291+
},
292+
},
205293
wantStatus: nil,
206294
},
295+
{
296+
name: "enable scheduling when nodeMetric with nil updateTime",
297+
nodeMetric: &slov1alpha1.NodeMetric{
298+
ObjectMeta: metav1.ObjectMeta{
299+
Name: "test-node-1",
300+
},
301+
Spec: slov1alpha1.NodeMetricSpec{
302+
CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{
303+
ReportIntervalSeconds: pointer.Int64(60),
304+
},
305+
},
306+
},
307+
enableScheduleWhenNodeMetricsExpired: pointer.Bool(true),
308+
wantStatus: nil,
309+
},
310+
{
311+
name: "enable scheduling when nodeMetric with expired updateTime",
312+
nodeMetric: &slov1alpha1.NodeMetric{
313+
ObjectMeta: metav1.ObjectMeta{
314+
Name: "test-node-1",
315+
},
316+
Spec: slov1alpha1.NodeMetricSpec{
317+
CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{
318+
ReportIntervalSeconds: pointer.Int64(60),
319+
},
320+
},
321+
Status: slov1alpha1.NodeMetricStatus{
322+
UpdateTime: &metav1.Time{
323+
Time: time.Now().Add(-180 * time.Second),
324+
},
325+
},
326+
},
327+
enableScheduleWhenNodeMetricsExpired: pointer.Bool(true),
328+
wantStatus: nil,
329+
},
330+
{
331+
name: "disable scheduling when nodeMetric with nil updateTime",
332+
nodeMetric: &slov1alpha1.NodeMetric{
333+
ObjectMeta: metav1.ObjectMeta{
334+
Name: "test-node-1",
335+
},
336+
Spec: slov1alpha1.NodeMetricSpec{
337+
CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{
338+
ReportIntervalSeconds: pointer.Int64(60),
339+
},
340+
},
341+
},
342+
enableScheduleWhenNodeMetricsExpired: pointer.Bool(false),
343+
wantStatus: framework.NewStatus(framework.Unschedulable, ErrReasonNodeMetricExpired),
344+
},
345+
{
346+
name: "disable scheduling when nodeMetric with expired updateTime",
347+
nodeMetric: &slov1alpha1.NodeMetric{
348+
ObjectMeta: metav1.ObjectMeta{
349+
Name: "test-node-1",
350+
},
351+
Spec: slov1alpha1.NodeMetricSpec{
352+
CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{
353+
ReportIntervalSeconds: pointer.Int64(60),
354+
},
355+
},
356+
Status: slov1alpha1.NodeMetricStatus{
357+
UpdateTime: &metav1.Time{
358+
Time: time.Now().Add(-180 * time.Second),
359+
},
360+
},
361+
},
362+
enableScheduleWhenNodeMetricsExpired: pointer.Bool(false),
363+
wantStatus: framework.NewStatus(framework.Unschedulable, ErrReasonNodeMetricExpired),
364+
},
207365
}
208366
for _, tt := range tests {
209367
t.Run(tt.name, func(t *testing.T) {
210368
var v1beta3args v1beta3.LoadAwareSchedulingArgs
211369
v1beta3.SetDefaults_LoadAwareSchedulingArgs(&v1beta3args)
370+
v1beta3args.EnableScheduleWhenNodeMetricsExpired = tt.enableScheduleWhenNodeMetricsExpired
212371
var loadAwareSchedulingArgs config.LoadAwareSchedulingArgs
213372
err := v1beta3.Convert_v1beta3_LoadAwareSchedulingArgs_To_config_LoadAwareSchedulingArgs(&v1beta3args, &loadAwareSchedulingArgs, nil)
214373
assert.NoError(t, err)

0 commit comments

Comments
 (0)