Skip to content

Commit b4608c3

Browse files
ffromanishajmakh
andcommitted
controller: sched: detect and use cluster properties
The fix has landed in OCP nightly 4.20, make sure this is reflected to consume the new default of schedulerInformer in case it is not set by the user. Previously the version parsing relied on the assumption that the platform version is of type stable (e.i 4.20.1, 4.20.2 ..) but this is in correct because the cluster may be deployed with different types of builds and each requires its own processing logic, which must know the first supported build for each type. Ensure that the different types are recognized and processed as required. Design notes: moved platforminfo in a separate package, still internal. Previously, it was a private controller helper. Moved into a new packge mostly because didn't want to pollute controllers with version check. Added a Properties sub-struct vs add straight into the PlatformInfo the value. This is debatable. We don't plan to add more properties, so why to bother? Thing is, I can't remember a time on which I added a specific flag or field top-level and never regret later on. At very least we have namespaced properties which is nicer. I acknowledge the tension between not overcomplicate things, not do things "just in case" (there is no future expansion on the radar) but still if we don't namespace things early on, it's harder to retrofit. So went with this approach this time, but again, is debatable. The same line of thought lead to implement the version check in a specific, non-generic way tailor made for this case, exactly because we don't see more usecases coming so generalizing felt overkill. The summarization is thus that the **internal** API was made extensible (even if we don't see extensions happening) because APIs are harder to change, so "just in case" has more merit here. The implementation, on the other hand, is simpler to change and addresses only the problem at hand. PlatformInfo is heavily overused. Should probably be ClusterInfo. Co-authored-by: Shereen Haj <[email protected]> Signed-off-by: Francesco Romani <[email protected]>
1 parent c210293 commit b4608c3

File tree

7 files changed

+482
-101
lines changed

7 files changed

+482
-101
lines changed

cmd/main.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ import (
6060
"github.com/openshift-kni/numaresources-operator/internal/api/features"
6161
"github.com/openshift-kni/numaresources-operator/internal/controller"
6262
intkloglevel "github.com/openshift-kni/numaresources-operator/internal/kloglevel"
63+
"github.com/openshift-kni/numaresources-operator/internal/platforminfo"
6364
"github.com/openshift-kni/numaresources-operator/pkg/hash"
6465
"github.com/openshift-kni/numaresources-operator/pkg/images"
6566
rtemetricsmanifests "github.com/openshift-kni/numaresources-operator/pkg/metrics/manifests/monitor"
@@ -338,10 +339,7 @@ func main() {
338339
Scheme: mgr.GetScheme(),
339340
SchedulerManifests: schedMf,
340341
Namespace: namespace,
341-
PlatformInfo: controller.PlatformInfo{
342-
Platform: clusterPlatform,
343-
Version: clusterPlatformVersion,
344-
},
342+
PlatformInfo: platforminfo.New(clusterPlatform, clusterPlatformVersion),
345343
}).SetupWithManager(mgr); err != nil {
346344
klog.ErrorS(err, "unable to create controller", "controller", "NUMAResourcesScheduler")
347345
os.Exit(1)

internal/controller/numaresourcesscheduler_controller.go

Lines changed: 10 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ import (
4747

4848
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
4949
"github.com/openshift-kni/numaresources-operator/internal/api/annotations"
50+
"github.com/openshift-kni/numaresources-operator/internal/platforminfo"
5051
"github.com/openshift-kni/numaresources-operator/internal/relatedobjects"
5152
"github.com/openshift-kni/numaresources-operator/pkg/apply"
5253
"github.com/openshift-kni/numaresources-operator/pkg/hash"
@@ -60,26 +61,17 @@ import (
6061
)
6162

6263
const (
63-
// ActivePodsResourcesSupportSince defines the OCP version which started to support the fixed kubelet
64-
// in which the PodResourcesAPI lists the active pods by default
65-
activePodsResourcesSupportSince = "4.20.999"
66-
6764
// MaxSchedulerReplicas is the maximum number of scheduler replicas that can be deployed
6865
MaxSchedulerReplicas = 3
6966
)
7067

71-
type PlatformInfo struct {
72-
Platform platform.Platform
73-
Version platform.Version
74-
}
75-
7668
// NUMAResourcesSchedulerReconciler reconciles a NUMAResourcesScheduler object
7769
type NUMAResourcesSchedulerReconciler struct {
7870
client.Client
7971
Scheme *runtime.Scheme
8072
SchedulerManifests schedmanifests.Manifests
8173
Namespace string
82-
PlatformInfo PlatformInfo
74+
PlatformInfo platforminfo.PlatformInfo
8375
}
8476

8577
// Namespace Scoped
@@ -359,26 +351,21 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
359351
return schedStatus, nil
360352
}
361353

362-
func platformNormalize(spec *nropv1.NUMAResourcesSchedulerSpec, platInfo PlatformInfo) {
354+
func platformNormalize(spec *nropv1.NUMAResourcesSchedulerSpec, platInfo platforminfo.PlatformInfo) {
363355
if platInfo.Platform != platform.OpenShift && platInfo.Platform != platform.HyperShift {
364356
return
365357
}
366-
367-
parsedVersion, _ := platform.ParseVersion(activePodsResourcesSupportSince)
368-
ok, err := platInfo.Version.AtLeast(parsedVersion)
369-
if err != nil {
370-
klog.Infof("failed to compare version %v with %v, err %v", parsedVersion, platInfo.Version, err)
358+
if spec.SchedulerInformer != nil {
359+
// assume user-provided value. Nothing to do.
360+
klog.V(4).InfoS("SchedulerInformer explicit value", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", *spec.SchedulerInformer)
371361
return
372362
}
373-
374-
if !ok {
363+
if !platInfo.Properties.PodResourcesListFilterActivePods {
364+
// keep shared default for backward compatibility. TODO: review/switch default in 4.21
375365
return
376366
}
377-
378-
if spec.SchedulerInformer == nil {
379-
spec.SchedulerInformer = ptr.To(nropv1.SchedulerInformerShared)
380-
klog.V(4).InfoS("SchedulerInformer default is overridden", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", &spec.SchedulerInformer)
381-
}
367+
spec.SchedulerInformer = ptr.To(nropv1.SchedulerInformerShared)
368+
klog.V(4).InfoS("SchedulerInformer default is overridden", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", *spec.SchedulerInformer)
382369
}
383370

384371
func buildDedicatedInformerCondition(instance nropv1.NUMAResourcesScheduler, normalized nropv1.NUMAResourcesSchedulerSpec) metav1.Condition {

internal/controller/numaresourcesscheduler_controller_test.go

Lines changed: 28 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ import (
4949
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
5050
"github.com/openshift-kni/numaresources-operator/internal/api/annotations"
5151
testobjs "github.com/openshift-kni/numaresources-operator/internal/objects"
52+
"github.com/openshift-kni/numaresources-operator/internal/platforminfo"
5253
"github.com/openshift-kni/numaresources-operator/pkg/hash"
5354
nrosched "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler"
5455
schedmanifests "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler/manifests/sched"
@@ -847,9 +848,12 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
847848
numOfMasters := 3
848849

849850
When("kubelet fix is enabled", func() {
850-
fixedVersion, _ := platform.ParseVersion(activePodsResourcesSupportSince)
851+
fixedVersion, _ := platform.ParseVersion("4.20.0")
852+
unfixedVersion, _ := platform.ParseVersion("4.19.0") // can't (and we must not even if we can) rewrite history
853+
futureFixedVersionZstream, _ := platform.ParseVersion("4.20.1") // we must never regress
854+
futureFixedVersion, _ := platform.ParseVersion("4.21.0") // we must never regress
851855

852-
DescribeTable("should configure by default the informerMode to the expected when field is not set", func(reconcilerPlatInfo PlatformInfo, expectedInformer string) {
856+
DescribeTable("should configure by default the informerMode to the expected when field is not set", func(reconcilerPlatInfo platforminfo.PlatformInfo, expectedInformer string) {
853857
var err error
854858
nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
855859
initObjects := []runtime.Object{nrs}
@@ -876,17 +880,18 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
876880
Expect(c).ToNot(BeNil())
877881
Expect(c.Status).To(Equal(expectedDedicatedActiveStatus))
878882
},
879-
Entry("with fixed Openshift the default informer is Shared", PlatformInfo{
880-
Platform: platform.OpenShift,
881-
Version: fixedVersion,
882-
}, depmanifests.CacheInformerShared),
883-
Entry("with fixed Hypershift the default informer is Shared", PlatformInfo{
884-
Platform: platform.HyperShift,
885-
Version: fixedVersion,
886-
}, depmanifests.CacheInformerShared),
887-
Entry("with unknown platform the default informer is Dedicated (unchanged)", PlatformInfo{}, depmanifests.CacheInformerDedicated))
888-
889-
DescribeTable("should preserve informerMode value if set", func(reconcilerPlatInfo PlatformInfo) {
883+
Entry("with fixed Openshift the default informer is Shared", platforminfo.New(platform.OpenShift, fixedVersion), depmanifests.CacheInformerShared),
884+
Entry("with fixed Hypershift the default informer is Shared", platforminfo.New(platform.HyperShift, fixedVersion), depmanifests.CacheInformerShared),
885+
Entry("with unfixed platform the default informer is Dedicated (unchanged)", platforminfo.New(platform.OpenShift, unfixedVersion), depmanifests.CacheInformerDedicated),
886+
Entry("with unfixed platform the default informer is Dedicated (unchanged)", platforminfo.New(platform.HyperShift, unfixedVersion), depmanifests.CacheInformerDedicated),
887+
Entry("with fixed Openshift the default informer is Shared", platforminfo.New(platform.OpenShift, futureFixedVersion), depmanifests.CacheInformerShared),
888+
Entry("with fixed Hypershift the default informer is Shared", platforminfo.New(platform.HyperShift, futureFixedVersion), depmanifests.CacheInformerShared),
889+
Entry("with fixed Openshift the default informer is Shared", platforminfo.New(platform.OpenShift, futureFixedVersionZstream), depmanifests.CacheInformerShared),
890+
Entry("with fixed Hypershift the default informer is Shared", platforminfo.New(platform.HyperShift, futureFixedVersionZstream), depmanifests.CacheInformerShared),
891+
Entry("with unknown platform the default informer is Dedicated (unchanged)", platforminfo.PlatformInfo{}, depmanifests.CacheInformerDedicated),
892+
)
893+
894+
DescribeTable("should preserve informerMode value if set", func(reconcilerPlatInfo platforminfo.PlatformInfo) {
890895
var err error
891896
nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
892897
infMode := nropv1.SchedulerInformerDedicated
@@ -903,17 +908,17 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
903908
Expect(err).ToNot(HaveOccurred())
904909
expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, string(infMode))
905910
},
906-
Entry("with Openshift", PlatformInfo{
911+
Entry("with Openshift", platforminfo.PlatformInfo{
907912
Platform: platform.OpenShift,
908913
Version: fixedVersion,
909914
}),
910-
Entry("with Hypershift", PlatformInfo{
915+
Entry("with Hypershift", platforminfo.PlatformInfo{
911916
Platform: platform.HyperShift,
912917
Version: fixedVersion,
913918
}),
914-
Entry("with unknown platform", PlatformInfo{}))
919+
Entry("with unknown platform", platforminfo.PlatformInfo{}))
915920

916-
DescribeTable("should allow to update the informerMode to be Dedicated after an overridden default", func(reconcilerPlatInfo PlatformInfo) {
921+
DescribeTable("should allow to update the informerMode to be Dedicated after an overridden default", func(reconcilerPlatInfo platforminfo.PlatformInfo) {
917922
var err error
918923
nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
919924
initObjects := []runtime.Object{nrs}
@@ -948,11 +953,11 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
948953

949954
expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, string(informerMode))
950955
},
951-
Entry("with Openshift", PlatformInfo{
956+
Entry("with Openshift", platforminfo.PlatformInfo{
952957
Platform: platform.OpenShift,
953958
Version: fixedVersion,
954959
}),
955-
Entry("with Hypershift", PlatformInfo{
960+
Entry("with Hypershift", platforminfo.PlatformInfo{
956961
Platform: platform.HyperShift,
957962
Version: fixedVersion,
958963
}))
@@ -972,7 +977,7 @@ var _ = Describe("Test computeSchedulerReplicas", func() {
972977
DescribeTable("should compute replicas correctly for different platforms and node counts",
973978
func(platform platform.Platform, numControlPlane, numWorker int, expectedReplicas *int32, expectError bool) {
974979
// setup reconciler with platform info
975-
reconciler.PlatformInfo = PlatformInfo{
980+
reconciler.PlatformInfo = platforminfo.PlatformInfo{
976981
Platform: platform,
977982
Version: "v4.14.0",
978983
}
@@ -989,7 +994,7 @@ var _ = Describe("Test computeSchedulerReplicas", func() {
989994
// recreate reconciler with nodes
990995
reconciler, err := NewFakeNUMAResourcesSchedulerReconciler(nodes...)
991996
Expect(err).ToNot(HaveOccurred())
992-
reconciler.PlatformInfo = PlatformInfo{
997+
reconciler.PlatformInfo = platforminfo.PlatformInfo{
993998
Platform: platform,
994999
Version: "v4.14.0",
9951000
}
@@ -1035,7 +1040,7 @@ var _ = Describe("Test computeSchedulerReplicas", func() {
10351040
Context("when replicas are explicitly set", func() {
10361041
It("should return the explicitly set replicas without checking nodes", func() {
10371042
// setup reconciler with HyperShift platform (uses worker nodes)
1038-
reconciler.PlatformInfo = PlatformInfo{
1043+
reconciler.PlatformInfo = platforminfo.PlatformInfo{
10391044
Platform: platform.HyperShift,
10401045
Version: "v4.14.0",
10411046
}
@@ -1044,7 +1049,7 @@ var _ = Describe("Test computeSchedulerReplicas", func() {
10441049
nodes := fakeNodes(3, 0) // only control-plane nodes
10451050
reconciler, err := NewFakeNUMAResourcesSchedulerReconciler(nodes...)
10461051
Expect(err).ToNot(HaveOccurred())
1047-
reconciler.PlatformInfo = PlatformInfo{
1052+
reconciler.PlatformInfo = platforminfo.PlatformInfo{
10481053
Platform: platform.HyperShift,
10491054
Version: "v4.14.0",
10501055
}
@@ -1066,57 +1071,6 @@ var _ = Describe("Test computeSchedulerReplicas", func() {
10661071
})
10671072
})
10681073

1069-
var _ = Describe("Test scheduler spec PreNormalize", func() {
1070-
When("Spec.SchedulerInformer is not set by the user", func() {
1071-
It("should override default informer to Shared if kubelet is fixed - first supported zstream version", func() {
1072-
v, _ := platform.ParseVersion(activePodsResourcesSupportSince)
1073-
spec := nropv1.NUMAResourcesSchedulerSpec{}
1074-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
1075-
Expect(*spec.SchedulerInformer).To(Equal(nropv1.SchedulerInformerShared))
1076-
})
1077-
1078-
It("should override default informer to Shared if kubelet is fixed - version is greater than first supported (zstream)", func() {
1079-
v, _ := platform.ParseVersion("4.20.1000")
1080-
spec := nropv1.NUMAResourcesSchedulerSpec{}
1081-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
1082-
Expect(*spec.SchedulerInformer).To(Equal(nropv1.SchedulerInformerShared))
1083-
})
1084-
1085-
It("should override default informer to Shared if kubelet is fixed - version is greater than first supported (ystream)", func() {
1086-
v, _ := platform.ParseVersion("4.21.0")
1087-
spec := nropv1.NUMAResourcesSchedulerSpec{}
1088-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
1089-
Expect(*spec.SchedulerInformer).To(Equal(nropv1.SchedulerInformerShared))
1090-
})
1091-
1092-
It("should not override default informer if kubelet is not fixed - version is less than first supported (zstream)", func() {
1093-
// this is only for testing purposes as there is plan to backport the fix to older minor versions
1094-
// will need to remove this test if the fix is supported starting the first zstream of the release
1095-
v, _ := platform.ParseVersion("4.20.0")
1096-
spec := nropv1.NUMAResourcesSchedulerSpec{}
1097-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
1098-
Expect(spec.SchedulerInformer).To(BeNil())
1099-
})
1100-
1101-
It("should not override default informer if kubelet is not fixed - version is less than first supported (ystream)", func() {
1102-
v, _ := platform.ParseVersion("4.13.0")
1103-
spec := nropv1.NUMAResourcesSchedulerSpec{}
1104-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
1105-
Expect(spec.SchedulerInformer).To(BeNil())
1106-
})
1107-
})
1108-
When("Spec.SchedulerInformer is set by the user", func() {
1109-
It("should preserve informer value set by the user even if kubelet is fixed", func() {
1110-
v, _ := platform.ParseVersion(activePodsResourcesSupportSince)
1111-
spec := nropv1.NUMAResourcesSchedulerSpec{
1112-
SchedulerInformer: ptr.To(nropv1.SchedulerInformerDedicated),
1113-
}
1114-
platformNormalize(&spec, PlatformInfo{Platform: platform.OpenShift, Version: v})
1115-
Expect(*spec.SchedulerInformer).To(Equal(nropv1.SchedulerInformerDedicated))
1116-
})
1117-
})
1118-
})
1119-
11201074
func HaveTheSameResourceRequirements(expectedRR corev1.ResourceRequirements) types.GomegaMatcher {
11211075
return gcustom.MakeMatcher(func(actual *appsv1.Deployment) (bool, error) {
11221076
cntName := schedupdate.MainContainerName // shortcut

internal/platforminfo/platforminfo.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Copyright 2025 Red Hat, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package platforminfo
18+
19+
import (
20+
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
21+
)
22+
23+
// PlatformProperties represents the platform capabilities we have to infer and which are not explicitly
24+
// advertised from the platform using standard APIs.
25+
type PlatformProperties struct {
26+
PodResourcesListFilterActivePods bool
27+
}
28+
29+
type PlatformInfo struct {
30+
Platform platform.Platform
31+
Version platform.Version
32+
Properties PlatformProperties
33+
}
34+
35+
func New(plat platform.Platform, ver platform.Version) PlatformInfo {
36+
info := PlatformInfo{
37+
Platform: plat,
38+
Version: ver,
39+
}
40+
discoverProperties(&info)
41+
return info
42+
}
43+
44+
func discoverProperties(info *PlatformInfo) {
45+
info.Properties.PodResourcesListFilterActivePods = isVersionEnoughForPodresourcesListFilterActivePods(info.Platform, info.Version)
46+
}

0 commit comments

Comments
 (0)