Skip to content

Commit 142a8a9

Browse files
ffromanishajmakh
andcommitted
controller: sched: detect and use cluster properties
The fix has landed in OCP 4.18, make sure this is reflected to consume the new default of schedulerInformer in case it is not set by the user. Previously the version parsing relied on the assumption that the platform version is of type stable (e.i 4.19.9, 4.20.2 ..) but this is incorrect because the cluster may be deployed with different types of builds and each requires its own processing logic, which must know the first supported build for each type. Ensure that the different types are recognized and processed as required. Design notes: moved platforminfo in a separate package, still internal. Previously, it was a private controller helper. Moved into a new packge mostly because didn't want to pollute controllers with version check. Added a Properties sub-struct vs add straight into the PlatformInfo the value. This is debatable. We don't plan to add more properties, so why to bother? Thing is, I can't remember a time on which I added a specific flag or field top-level and never regret later on. At very least we have namespaced properties which is nicer. I acknowledge the tension between not overcomplicate things, not do things "just in case" (there is no future expansion on the radar) but still if we don't namespace things early on, it's harder to retrofit. So went with this approach this time, but again, is debatable. The same line of thought lead to implement the version check in a specific, non-generic way tailor made for this case, exactly because we don't see more usecases coming so generalizing felt overkill. The summarization is thus that the **internal** API was made extensible (even if we don't see extensions happening) because APIs are harder to change, so "just in case" has more merit here. The implementation, on the other hand, is simpler to change and addresses only the problem at hand. PlatformInfo is heavily overused. Should probably be ClusterInfo. Co-authored-by: Shereen Haj <[email protected]> Signed-off-by: Francesco Romani <[email protected]> (cherry picked from commit b4608c3) (cherry picked from commit 94371a5)
1 parent 3f81dcb commit 142a8a9

File tree

7 files changed

+404
-97
lines changed

7 files changed

+404
-97
lines changed

controllers/numaresourcesscheduler_controller.go

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ import (
4444

4545
nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1"
4646
"github.com/openshift-kni/numaresources-operator/internal/api/annotations"
47+
"github.com/openshift-kni/numaresources-operator/internal/platforminfo"
4748
"github.com/openshift-kni/numaresources-operator/internal/relatedobjects"
4849
"github.com/openshift-kni/numaresources-operator/pkg/apply"
4950
"github.com/openshift-kni/numaresources-operator/pkg/hash"
@@ -56,25 +57,14 @@ import (
5657
"github.com/openshift-kni/numaresources-operator/pkg/status"
5758
)
5859

59-
const (
60-
// ActivePodsResourcesSupportSince defines the OCP version which started to support the fixed kubelet
61-
// in which the PodResourcesAPI lists the active pods by default
62-
activePodsResourcesSupportSince = "4.18.23"
63-
)
64-
65-
type PlatformInfo struct {
66-
Platform platform.Platform
67-
Version platform.Version
68-
}
69-
7060
// NUMAResourcesSchedulerReconciler reconciles a NUMAResourcesScheduler object
7161
type NUMAResourcesSchedulerReconciler struct {
7262
client.Client
7363
Scheme *runtime.Scheme
7464
SchedulerManifests schedmanifests.Manifests
7565
Namespace string
7666
AutodetectReplicas int
77-
PlatformInfo PlatformInfo
67+
PlatformInfo platforminfo.PlatformInfo
7868
}
7969

8070
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles,verbs=*
@@ -271,26 +261,21 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
271261
return schedStatus, nil
272262
}
273263

274-
func platformNormalize(spec *nropv1.NUMAResourcesSchedulerSpec, platInfo PlatformInfo) {
264+
func platformNormalize(spec *nropv1.NUMAResourcesSchedulerSpec, platInfo platforminfo.PlatformInfo) {
275265
if platInfo.Platform != platform.OpenShift && platInfo.Platform != platform.HyperShift {
276266
return
277267
}
278-
279-
parsedVersion, _ := platform.ParseVersion(activePodsResourcesSupportSince)
280-
ok, err := platInfo.Version.AtLeast(parsedVersion)
281-
if err != nil {
282-
klog.Infof("failed to compare version %v with %v, err %v", parsedVersion, platInfo.Version, err)
268+
if spec.SchedulerInformer != nil {
269+
// assume user-provided value. Nothing to do.
270+
klog.V(4).InfoS("SchedulerInformer explicit value", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", *spec.SchedulerInformer)
283271
return
284272
}
285-
286-
if !ok {
273+
if !platInfo.Properties.PodResourcesListFilterActivePods {
274+
// keep shared default for backward compatibility. TODO: review/switch default in 4.21
287275
return
288276
}
289-
290-
if spec.SchedulerInformer == nil {
291-
spec.SchedulerInformer = ptr.To(nropv1.SchedulerInformerShared)
292-
klog.V(4).InfoS("SchedulerInformer default is overridden", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", &spec.SchedulerInformer)
293-
}
277+
spec.SchedulerInformer = ptr.To(nropv1.SchedulerInformerShared)
278+
klog.V(4).InfoS("SchedulerInformer default is overridden", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", *spec.SchedulerInformer)
294279
}
295280

296281
func buildDedicatedInformerCondition(instance nropv1.NUMAResourcesScheduler, normalized nropv1.NUMAResourcesSchedulerSpec) metav1.Condition {

controllers/numaresourcesscheduler_controller_test.go

Lines changed: 24 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ import (
4949
nropv1 "github.com/openshift-kni/numaresources-operator/api/numaresourcesoperator/v1"
5050
"github.com/openshift-kni/numaresources-operator/internal/api/annotations"
5151
testobjs "github.com/openshift-kni/numaresources-operator/internal/objects"
52+
"github.com/openshift-kni/numaresources-operator/internal/platforminfo"
5253
"github.com/openshift-kni/numaresources-operator/pkg/hash"
5354
nrosched "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler"
5455
schedmanifests "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler/manifests/sched"
@@ -766,9 +767,12 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
766767
var reconciler *NUMAResourcesSchedulerReconciler
767768

768769
ginkgo.When("kubelet fix is enabled", func() {
769-
fixedVersion, _ := platform.ParseVersion(activePodsResourcesSupportSince)
770+
fixedVersion, _ := platform.ParseVersion("4.18.23")
771+
unfixedVersion, _ := platform.ParseVersion("4.18.0") // can't (and we must not even if we can) rewrite history
772+
futureFixedVersionZstream, _ := platform.ParseVersion("4.18.26") // we must never regress
773+
futureFixedVersion, _ := platform.ParseVersion("4.21.0") // we must never regress
770774

771-
ginkgo.DescribeTable("should configure by default the informerMode to the expected when field is not set", func(reconcilerPlatInfo PlatformInfo, expectedInformer string) {
775+
ginkgo.DescribeTable("should configure by default the informerMode to the expected when field is not set", func(reconcilerPlatInfo platforminfo.PlatformInfo, expectedInformer string) {
772776
var err error
773777
nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
774778
reconciler, err = NewFakeNUMAResourcesSchedulerReconciler(nrs)
@@ -793,17 +797,18 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
793797
gomega.Expect(c).ToNot(gomega.BeNil())
794798
gomega.Expect(c.Status).To(gomega.Equal(expectedDedicatedActiveStatus))
795799
},
796-
ginkgo.Entry("with fixed Openshift the default informer is Shared", PlatformInfo{
797-
Platform: platform.OpenShift,
798-
Version: fixedVersion,
799-
}, depmanifests.CacheInformerShared),
800-
ginkgo.Entry("with fixed Hypershift the default informer is Shared", PlatformInfo{
801-
Platform: platform.HyperShift,
802-
Version: fixedVersion,
803-
}, depmanifests.CacheInformerShared),
804-
ginkgo.Entry("with unknown platform the default informer is Dedicated (unchanged)", PlatformInfo{}, depmanifests.CacheInformerDedicated))
805-
806-
ginkgo.DescribeTable("should preserve informerMode value if set", func(reconcilerPlatInfo PlatformInfo) {
800+
ginkgo.Entry("with fixed Openshift the default informer is Shared", platforminfo.New(platform.OpenShift, fixedVersion), depmanifests.CacheInformerShared),
801+
ginkgo.Entry("with fixed Hypershift the default informer is Shared", platforminfo.New(platform.HyperShift, fixedVersion), depmanifests.CacheInformerShared),
802+
ginkgo.Entry("with unfixed platform the default informer is Dedicated (unchanged)", platforminfo.New(platform.OpenShift, unfixedVersion), depmanifests.CacheInformerDedicated),
803+
ginkgo.Entry("with unfixed platform the default informer is Dedicated (unchanged)", platforminfo.New(platform.HyperShift, unfixedVersion), depmanifests.CacheInformerDedicated),
804+
ginkgo.Entry("with fixed Openshift the default informer is Shared", platforminfo.New(platform.OpenShift, futureFixedVersion), depmanifests.CacheInformerShared),
805+
ginkgo.Entry("with fixed Hypershift the default informer is Shared", platforminfo.New(platform.HyperShift, futureFixedVersion), depmanifests.CacheInformerShared),
806+
ginkgo.Entry("with fixed Openshift the default informer is Shared", platforminfo.New(platform.OpenShift, futureFixedVersionZstream), depmanifests.CacheInformerShared),
807+
ginkgo.Entry("with fixed Hypershift the default informer is Shared", platforminfo.New(platform.HyperShift, futureFixedVersionZstream), depmanifests.CacheInformerShared),
808+
ginkgo.Entry("with unknown platform the default informer is Dedicated (unchanged)", platforminfo.PlatformInfo{}, depmanifests.CacheInformerDedicated),
809+
)
810+
811+
ginkgo.DescribeTable("should preserve informerMode value if set", func(reconcilerPlatInfo platforminfo.PlatformInfo) {
807812
var err error
808813
nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
809814
infMode := nropv1.SchedulerInformerDedicated
@@ -818,17 +823,17 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
818823
gomega.Expect(err).ToNot(gomega.HaveOccurred())
819824
expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, string(infMode))
820825
},
821-
ginkgo.Entry("with Openshift", PlatformInfo{
826+
ginkgo.Entry("with Openshift", platforminfo.PlatformInfo{
822827
Platform: platform.OpenShift,
823828
Version: fixedVersion,
824829
}),
825-
ginkgo.Entry("with Hypershift", PlatformInfo{
830+
ginkgo.Entry("with Hypershift", platforminfo.PlatformInfo{
826831
Platform: platform.HyperShift,
827832
Version: fixedVersion,
828833
}),
829-
ginkgo.Entry("with unknown platform", PlatformInfo{}))
834+
ginkgo.Entry("with unknown platform", platforminfo.PlatformInfo{}))
830835

831-
ginkgo.DescribeTable("should allow to update the informerMode to be Dedicated after an overridden default", func(reconcilerPlatInfo PlatformInfo) {
836+
ginkgo.DescribeTable("should allow to update the informerMode to be Dedicated after an overridden default", func(reconcilerPlatInfo platforminfo.PlatformInfo) {
832837
var err error
833838
nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
834839
reconciler, err = NewFakeNUMAResourcesSchedulerReconciler(nrs)
@@ -861,67 +866,18 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() {
861866

862867
expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, string(informerMode))
863868
},
864-
ginkgo.Entry("with Openshift", PlatformInfo{
869+
ginkgo.Entry("with Openshift", platforminfo.PlatformInfo{
865870
Platform: platform.OpenShift,
866871
Version: fixedVersion,
867872
}),
868-
ginkgo.Entry("with Hypershift", PlatformInfo{
873+
ginkgo.Entry("with Hypershift", platforminfo.PlatformInfo{
869874
Platform: platform.HyperShift,
870875
Version: fixedVersion,
871876
}))
872877
})
873878
})
874879
})
875880

876-
var _ = ginkgo.Describe("Test scheduler spec platformNormalize", func() {
877-
ginkgo.When("Spec.SchedulerInformer is not set by the user", func() {
878-
ginkgo.It("should override default informer to Shared if kubelet is fixed - first supported zstream version", func() {
879-
v, _ := platform.ParseVersion(activePodsResourcesSupportSince)
880-
spec := nropv1.NUMAResourcesSchedulerSpec{}
881-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
882-
gomega.Expect(*spec.SchedulerInformer).To(gomega.Equal(nropv1.SchedulerInformerShared))
883-
})
884-
885-
ginkgo.It("should override default informer to Shared if kubelet is fixed - version is greater than first supported (zstream)", func() {
886-
v, _ := platform.ParseVersion("4.18.24")
887-
spec := nropv1.NUMAResourcesSchedulerSpec{}
888-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
889-
gomega.Expect(*spec.SchedulerInformer).To(gomega.Equal(nropv1.SchedulerInformerShared))
890-
})
891-
892-
ginkgo.It("should override default informer to Shared if kubelet is fixed - version is greater than first supported (ystream)", func() {
893-
v, _ := platform.ParseVersion("4.20.0")
894-
spec := nropv1.NUMAResourcesSchedulerSpec{}
895-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
896-
gomega.Expect(*spec.SchedulerInformer).To(gomega.Equal(nropv1.SchedulerInformerShared))
897-
})
898-
899-
ginkgo.It("should not override default informer if kubelet is not fixed - version is less than first supported (zstream)", func() {
900-
v, _ := platform.ParseVersion("4.18.8")
901-
spec := nropv1.NUMAResourcesSchedulerSpec{}
902-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
903-
gomega.Expect(spec.SchedulerInformer).To(gomega.BeNil())
904-
})
905-
906-
ginkgo.It("should not override default informer if kubelet is not fixed - version is less than first supported (ystream)", func() {
907-
v, _ := platform.ParseVersion("4.13.0")
908-
spec := nropv1.NUMAResourcesSchedulerSpec{}
909-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
910-
gomega.Expect(spec.SchedulerInformer).To(gomega.BeNil())
911-
})
912-
})
913-
ginkgo.When("Spec.SchedulerInformer is set by the user", func() {
914-
ginkgo.It("should preserve informer value set by the user even if kubelet is fixed", func() {
915-
v, _ := platform.ParseVersion(activePodsResourcesSupportSince)
916-
spec := nropv1.NUMAResourcesSchedulerSpec{
917-
SchedulerInformer: ptr.To(nropv1.SchedulerInformerDedicated),
918-
}
919-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
920-
gomega.Expect(*spec.SchedulerInformer).To(gomega.Equal(nropv1.SchedulerInformerDedicated))
921-
})
922-
})
923-
})
924-
925881
func HaveTheSameResourceRequirements(expectedRR corev1.ResourceRequirements) types.GomegaMatcher {
926882
return gcustom.MakeMatcher(func(actual *appsv1.Deployment) (bool, error) {
927883
cntName := schedupdate.MainContainerName // shortcut

internal/platforminfo/platforminfo.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Copyright 2025 Red Hat, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package platforminfo
18+
19+
import (
20+
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
21+
)
22+
23+
// PlatformProperties represents the platform capabilities we have to infer and which are not explicitly
24+
// advertised from the platform using standard APIs.
25+
type PlatformProperties struct {
26+
PodResourcesListFilterActivePods bool
27+
}
28+
29+
type PlatformInfo struct {
30+
Platform platform.Platform
31+
Version platform.Version
32+
Properties PlatformProperties
33+
}
34+
35+
func New(plat platform.Platform, ver platform.Version) PlatformInfo {
36+
info := PlatformInfo{
37+
Platform: plat,
38+
Version: ver,
39+
}
40+
discoverProperties(&info)
41+
return info
42+
}
43+
44+
func discoverProperties(info *PlatformInfo) {
45+
info.Properties.PodResourcesListFilterActivePods = isVersionEnoughForPodresourcesListFilterActivePods(info.Platform, info.Version)
46+
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
* Copyright 2025 Red Hat, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package platforminfo
18+
19+
import (
20+
"reflect"
21+
"testing"
22+
23+
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
24+
)
25+
26+
func TestNewWithDiscover(t *testing.T) {
27+
testcases := []struct {
28+
description string
29+
plat platform.Platform
30+
ver platform.Version
31+
expected PlatformProperties
32+
}{
33+
{
34+
description: "empty",
35+
},
36+
{
37+
description: "last major unfixed",
38+
plat: platform.OpenShift,
39+
ver: mustParseVersion("4.18.0"),
40+
expected: PlatformProperties{
41+
PodResourcesListFilterActivePods: false,
42+
},
43+
},
44+
{
45+
description: "first major fixed",
46+
plat: platform.OpenShift,
47+
ver: mustParseVersion("4.20.0"), // at time of writing
48+
expected: PlatformProperties{
49+
PodResourcesListFilterActivePods: true,
50+
},
51+
},
52+
{
53+
description: "next Z-stream fixed, must never regress", // at time of writing
54+
plat: platform.OpenShift,
55+
ver: mustParseVersion("4.19.11"),
56+
expected: PlatformProperties{
57+
PodResourcesListFilterActivePods: true,
58+
},
59+
},
60+
{
61+
description: "next major fixed, must never regress", // at time of writing
62+
plat: platform.OpenShift,
63+
ver: mustParseVersion("4.21.0"),
64+
expected: PlatformProperties{
65+
PodResourcesListFilterActivePods: true,
66+
},
67+
},
68+
}
69+
for _, tc := range testcases {
70+
t.Run(tc.description, func(t *testing.T) {
71+
got := New(tc.plat, tc.ver)
72+
if !reflect.DeepEqual(got.Properties, tc.expected) {
73+
t.Errorf("expected %#v got %#v", tc.expected, got.Properties)
74+
}
75+
})
76+
}
77+
}
78+
79+
func mustParseVersion(ver string) platform.Version {
80+
v, err := platform.ParseVersion(ver)
81+
if err != nil {
82+
panic(err)
83+
}
84+
return v
85+
}

0 commit comments

Comments
 (0)