Skip to content

Commit 6a56840

Browse files
committed
controller: sched: conditionally enable Shared informer by default
We are expecting c6c23c50f8dbdb863acb260f82f749262572abd3 to land to openshift soon, once it lands it will be enabled by default and we want the scheduler to adapt to this behavior by default iff the user didn't explicitly set the informer mode in the CR. Until now, the scheduler works in a dedicated informer mode which is meant to take into account the pods in terminal state (such as pods that ran and completed) in the PFP computation to ensure it matches the PFP computed by the RTE and reported in NRT. The intended behavior which the new kubelet behavior is about ignoring such pods and accounting only for active pods, so if this behavior is enabled in the RTE-NRT, while kept the default dedicated in the scheduler there will be misalignment in the computed vs the expected PFP from scheduler's POV vs NRT's POV and a scheduler stall will happen that will never recover. In this commit we adjust the informer default value to Shared (instead of Dedicated) only if both below conditions are met: 1. the cluster version supports the fixed kubelet which is met if the cluster version is equal or greater to the known-to-be fixed OCP version. 2. the user didn't set the Spec.SchedulerInformer field in the NRS CR This modification will enable the shared mode which in turn includes only running pods (= active pods) in the PFP computation from POV allowing ultimately PFP alignment with NRT. Signed-off-by: Shereen Haj <[email protected]>
1 parent cf6b010 commit 6a56840

File tree

4 files changed

+200
-1
lines changed

4 files changed

+200
-1
lines changed

api/v1/numaresourcesscheduler_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ const (
3838
type SchedulerInformerMode string
3939

4040
const (
41-
// SchedulerInformerDedicated makes the NodeResourceTopologyMatch plugin use the default framework informer.
41+
// SchedulerInformerShared makes the NodeResourceTopologyMatch plugin use the default framework informer.
4242
SchedulerInformerShared SchedulerInformerMode = "Shared"
4343

4444
// SchedulerInformerDedicated sets an additional separate informer just for the NodeResourceTopologyMatch plugin. Default.

cmd/main.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,10 @@ func main() {
338338
Scheme: mgr.GetScheme(),
339339
SchedulerManifests: schedMf,
340340
Namespace: namespace,
341+
PlatformInfo: controller.PlatformInfo{
342+
Platform: clusterPlatform,
343+
Version: clusterPlatformVersion,
344+
},
341345
}).SetupWithManager(mgr); err != nil {
342346
klog.ErrorS(err, "unable to create controller", "controller", "NUMAResourcesScheduler")
343347
os.Exit(1)

internal/controller/numaresourcesscheduler_controller.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141
"sigs.k8s.io/controller-runtime/pkg/predicate"
4242
"sigs.k8s.io/controller-runtime/pkg/reconcile"
4343

44+
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
4445
k8swgmanifests "github.com/k8stopologyawareschedwg/deployer/pkg/manifests"
4546
k8swgrbacupdate "github.com/k8stopologyawareschedwg/deployer/pkg/objectupdate/rbac"
4647

@@ -58,12 +59,24 @@ import (
5859
"github.com/openshift-kni/numaresources-operator/pkg/status"
5960
)
6061

62+
const (
63+
// ActivePodsResourcesSupportSince defines the OCP version which started to support the fixed kubelet
64+
// in which the PodResourcesAPI lists the active pods by default
65+
activePodsResourcesSupportSince = "4.20.999"
66+
)
67+
68+
type PlatformInfo struct {
69+
Platform platform.Platform
70+
Version platform.Version
71+
}
72+
6173
// NUMAResourcesSchedulerReconciler reconciles a NUMAResourcesScheduler object
6274
type NUMAResourcesSchedulerReconciler struct {
6375
client.Client
6476
Scheme *runtime.Scheme
6577
SchedulerManifests schedmanifests.Manifests
6678
Namespace string
79+
PlatformInfo PlatformInfo
6780
}
6881

6982
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles,verbs=*
@@ -225,6 +238,8 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
225238
klog.V(4).Info("SchedulerSync start")
226239
defer klog.V(4).Info("SchedulerSync stop")
227240

241+
platformNormalize(&instance.Spec, r.PlatformInfo)
242+
228243
schedSpec := instance.Spec.Normalize()
229244
cacheResyncPeriod := unpackAPIResyncPeriod(schedSpec.CacheResyncPeriod)
230245
replicas, err := r.computeSchedulerReplicas(ctx, schedSpec)
@@ -293,6 +308,27 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
293308
return schedStatus, nil
294309
}
295310

311+
func platformNormalize(spec *nropv1.NUMAResourcesSchedulerSpec, platInfo PlatformInfo) {
312+
if platInfo.Platform != platform.OpenShift && platInfo.Platform != platform.HyperShift {
313+
return
314+
}
315+
316+
parsedVersion, _ := platform.ParseVersion(activePodsResourcesSupportSince)
317+
ok, err := platInfo.Version.AtLeast(parsedVersion)
318+
if err != nil {
319+
klog.Infof("failed to compare version %v with %v, err %v", parsedVersion, platInfo.Version, err)
320+
return
321+
}
322+
323+
if !ok {
324+
return
325+
}
326+
327+
if spec.SchedulerInformer == nil {
328+
spec.SchedulerInformer = ptr.To(nropv1.SchedulerInformerShared)
329+
klog.V(4).InfoS("SchedulerInformer default is overridden", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", &spec.SchedulerInformer)
330+
}
331+
}
296332
func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, sched *nropv1.NUMAResourcesScheduler, condition string, reason string, message string) error {
297333
sched.Status.Conditions, _ = status.UpdateConditions(sched.Status.Conditions, condition, reason, message)
298334
if err := r.Client.Status().Update(ctx, sched); err != nil {

internal/controller/numaresourcesscheduler_controller_test.go

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040

4141
configv1 "github.com/openshift/api/config/v1"
4242

43+
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
4344
depmanifests "github.com/k8stopologyawareschedwg/deployer/pkg/manifests"
4445
depobjupdate "github.com/k8stopologyawareschedwg/deployer/pkg/objectupdate"
4546

@@ -632,6 +633,164 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
632633
gomega.Expect(*dp.Spec.Replicas).To(gomega.Equal(int32(numOfMasters)), "number of replicas is different than number of control-planes nodes; want=%d got=%d", numOfMasters, *dp.Spec.Replicas)
633634
})
634635
})
636+
637+
ginkgo.Context("with kubelet PodResourcesAPI listing active pods by default", func() {
638+
var nrs *nropv1.NUMAResourcesScheduler
639+
var reconciler *NUMAResourcesSchedulerReconciler
640+
numOfMasters := 3
641+
642+
ginkgo.When("kubelet fix is enabled", func() {
643+
fixedVersion, _ := platform.ParseVersion(activePodsResourcesSupportSince)
644+
645+
ginkgo.DescribeTable("should configure by default the informerMode to the expected when field is not set", func(reconcilerPlatInfo PlatformInfo, expectedInformer string) {
646+
var err error
647+
nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
648+
initObjects := []runtime.Object{nrs}
649+
initObjects = append(initObjects, fakeNodes(numOfMasters, 3)...)
650+
reconciler, err = NewFakeNUMAResourcesSchedulerReconciler(initObjects...)
651+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
652+
653+
reconciler.PlatformInfo = reconcilerPlatInfo
654+
655+
key := client.ObjectKeyFromObject(nrs)
656+
_, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key})
657+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
658+
659+
expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, expectedInformer)
660+
},
661+
ginkgo.Entry("with fixed Openshift the default informer is Shared", PlatformInfo{
662+
Platform: platform.OpenShift,
663+
Version: fixedVersion,
664+
}, depmanifests.CacheInformerShared),
665+
ginkgo.Entry("with fixed Hypershift the default informer is Shared", PlatformInfo{
666+
Platform: platform.HyperShift,
667+
Version: fixedVersion,
668+
}, depmanifests.CacheInformerShared),
669+
ginkgo.Entry("with unknown platform the default informer is Dedicated (unchanged)", PlatformInfo{}, depmanifests.CacheInformerDedicated))
670+
671+
ginkgo.DescribeTable("should preserve informerMode value if set", func(reconcilerPlatInfo PlatformInfo) {
672+
var err error
673+
nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
674+
infMode := nropv1.SchedulerInformerDedicated
675+
nrs.Spec.SchedulerInformer = &infMode
676+
initObjects := []runtime.Object{nrs}
677+
initObjects = append(initObjects, fakeNodes(numOfMasters, 3)...)
678+
reconciler, err = NewFakeNUMAResourcesSchedulerReconciler(initObjects...)
679+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
680+
681+
reconciler.PlatformInfo = reconcilerPlatInfo
682+
683+
key := client.ObjectKeyFromObject(nrs)
684+
_, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key})
685+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
686+
expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, string(infMode))
687+
},
688+
ginkgo.Entry("with Openshift", PlatformInfo{
689+
Platform: platform.OpenShift,
690+
Version: fixedVersion,
691+
}),
692+
ginkgo.Entry("with Hypershift", PlatformInfo{
693+
Platform: platform.HyperShift,
694+
Version: fixedVersion,
695+
}),
696+
ginkgo.Entry("with unknown platform", PlatformInfo{}))
697+
698+
ginkgo.DescribeTable("should allow to update the informerMode to be Dedicated after an overridden default", func(reconcilerPlatInfo PlatformInfo) {
699+
var err error
700+
nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
701+
initObjects := []runtime.Object{nrs}
702+
initObjects = append(initObjects, fakeNodes(numOfMasters, 3)...)
703+
reconciler, err = NewFakeNUMAResourcesSchedulerReconciler(initObjects...)
704+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
705+
706+
reconciler.PlatformInfo = reconcilerPlatInfo
707+
708+
key := client.ObjectKeyFromObject(nrs)
709+
_, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key})
710+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
711+
712+
// intentionally skip checking default value
713+
714+
// should query the object after reconcile because the defaults are overridden
715+
gomega.Expect(reconciler.Client.Get(context.TODO(), key, nrs)).ToNot(gomega.HaveOccurred())
716+
717+
nrsUpdated := nrs.DeepCopy()
718+
informerMode := nropv1.SchedulerInformerDedicated
719+
nrsUpdated.Spec.SchedulerInformer = &informerMode
720+
gomega.Eventually(func() bool {
721+
if err := reconciler.Client.Update(context.TODO(), nrsUpdated); err != nil {
722+
klog.Warningf("failed to update the scheduler object; err: %v", err)
723+
return false
724+
}
725+
return true
726+
}, 30*time.Second, 5*time.Second).Should(gomega.BeTrue())
727+
728+
_, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key})
729+
gomega.Expect(err).ToNot(gomega.HaveOccurred())
730+
731+
expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, string(informerMode))
732+
},
733+
ginkgo.Entry("with Openshift", PlatformInfo{
734+
Platform: platform.OpenShift,
735+
Version: fixedVersion,
736+
}),
737+
ginkgo.Entry("with Hypershift", PlatformInfo{
738+
Platform: platform.HyperShift,
739+
Version: fixedVersion,
740+
}))
741+
})
742+
})
743+
})
744+
745+
var _ = ginkgo.Describe("Test scheduler spec PreNormalize", func() {
746+
ginkgo.When("Spec.SchedulerInformer is not set by the user", func() {
747+
ginkgo.It("should override default informer to Shared if kubelet is fixed - first supported zstream version", func() {
748+
v, _ := platform.ParseVersion(activePodsResourcesSupportSince)
749+
spec := nropv1.NUMAResourcesSchedulerSpec{}
750+
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
751+
gomega.Expect(*spec.SchedulerInformer).To(gomega.Equal(nropv1.SchedulerInformerShared))
752+
})
753+
754+
ginkgo.It("should override default informer to Shared if kubelet is fixed - version is greater than first supported (zstream)", func() {
755+
v, _ := platform.ParseVersion("4.20.1000")
756+
spec := nropv1.NUMAResourcesSchedulerSpec{}
757+
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
758+
gomega.Expect(*spec.SchedulerInformer).To(gomega.Equal(nropv1.SchedulerInformerShared))
759+
})
760+
761+
ginkgo.It("should override default informer to Shared if kubelet is fixed - version is greater than first supported (ystream)", func() {
762+
v, _ := platform.ParseVersion("4.21.0")
763+
spec := nropv1.NUMAResourcesSchedulerSpec{}
764+
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
765+
gomega.Expect(*spec.SchedulerInformer).To(gomega.Equal(nropv1.SchedulerInformerShared))
766+
})
767+
768+
ginkgo.It("should not override default informer if kubelet is not fixed - version is less than first supported (zstream)", func() {
769+
// this is only for testing purposes as there is plan to backport the fix to older minor versions
770+
// will need to remove this test if the fix is supported starting the first zstream of the release
771+
v, _ := platform.ParseVersion("4.20.0")
772+
spec := nropv1.NUMAResourcesSchedulerSpec{}
773+
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
774+
gomega.Expect(spec.SchedulerInformer).To(gomega.BeNil())
775+
})
776+
777+
ginkgo.It("should not override default informer if kubelet is not fixed - version is less than first supported (ystream)", func() {
778+
v, _ := platform.ParseVersion("4.13.0")
779+
spec := nropv1.NUMAResourcesSchedulerSpec{}
780+
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
781+
gomega.Expect(spec.SchedulerInformer).To(gomega.BeNil())
782+
})
783+
})
784+
ginkgo.When("Spec.SchedulerInformer is set by the user", func() {
785+
ginkgo.It("should preserve informer value set by the user even if kubelet is fixed", func() {
786+
v, _ := platform.ParseVersion(activePodsResourcesSupportSince)
787+
spec := nropv1.NUMAResourcesSchedulerSpec{
788+
SchedulerInformer: ptr.To(nropv1.SchedulerInformerDedicated),
789+
}
790+
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
791+
gomega.Expect(*spec.SchedulerInformer).To(gomega.Equal(nropv1.SchedulerInformerDedicated))
792+
})
793+
})
635794
})
636795

637796
func pop(m map[string]string, k string) string {

0 commit comments

Comments
 (0)