diff --git a/api/v1/numaresourcesscheduler_types.go b/api/v1/numaresourcesscheduler_types.go index 0dc03b3c7..ef3805369 100644 --- a/api/v1/numaresourcesscheduler_types.go +++ b/api/v1/numaresourcesscheduler_types.go @@ -38,7 +38,7 @@ const ( type SchedulerInformerMode string const ( - // SchedulerInformerDedicated makes the NodeResourceTopologyMatch plugin use the default framework informer. + // SchedulerInformerShared makes the NodeResourceTopologyMatch plugin use the default framework informer. SchedulerInformerShared SchedulerInformerMode = "Shared" // SchedulerInformerDedicated sets an additional separate informer just for the NodeResourceTopologyMatch plugin. Default. diff --git a/cmd/main.go b/cmd/main.go index a58881cb4..c0f1ad91b 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -58,6 +58,7 @@ import ( "github.com/openshift-kni/numaresources-operator/internal/api/features" "github.com/openshift-kni/numaresources-operator/internal/controller" intkloglevel "github.com/openshift-kni/numaresources-operator/internal/kloglevel" + "github.com/openshift-kni/numaresources-operator/internal/platforminfo" "github.com/openshift-kni/numaresources-operator/pkg/hash" "github.com/openshift-kni/numaresources-operator/pkg/images" rtemetricsmanifests "github.com/openshift-kni/numaresources-operator/pkg/metrics/manifests/monitor" @@ -330,6 +331,7 @@ func main() { SchedulerManifests: schedMf, Namespace: namespace, AutodetectReplicas: info.NodeCount, + PlatformInfo: platforminfo.New(clusterPlatform, clusterPlatformVersion), }).SetupWithManager(mgr); err != nil { klog.ErrorS(err, "unable to create controller", "controller", "NUMAResourcesScheduler") os.Exit(1) diff --git a/internal/controller/numaresourcesscheduler_controller.go b/internal/controller/numaresourcesscheduler_controller.go index cd6d2055c..e5fc260af 100644 --- a/internal/controller/numaresourcesscheduler_controller.go +++ b/internal/controller/numaresourcesscheduler_controller.go @@ -29,6 +29,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -37,11 +38,13 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform" k8swgmanifests "github.com/k8stopologyawareschedwg/deployer/pkg/manifests" k8swgrbacupdate "github.com/k8stopologyawareschedwg/deployer/pkg/objectupdate/rbac" nropv1 "github.com/openshift-kni/numaresources-operator/api/v1" "github.com/openshift-kni/numaresources-operator/internal/api/annotations" + "github.com/openshift-kni/numaresources-operator/internal/platforminfo" "github.com/openshift-kni/numaresources-operator/internal/relatedobjects" "github.com/openshift-kni/numaresources-operator/pkg/apply" "github.com/openshift-kni/numaresources-operator/pkg/hash" @@ -61,6 +64,7 @@ type NUMAResourcesSchedulerReconciler struct { SchedulerManifests schedmanifests.Manifests Namespace string AutodetectReplicas int + PlatformInfo platforminfo.PlatformInfo } //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles,verbs=* @@ -98,9 +102,11 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct return ctrl.Result{}, err } + initialStatus := *instance.Status.DeepCopy() + if req.Name != objectnames.DefaultNUMAResourcesSchedulerCrName { message := fmt.Sprintf("incorrect NUMAResourcesScheduler resource name: %s", instance.Name) - return ctrl.Result{}, r.updateStatus(ctx, instance, status.ConditionDegraded, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message) + return ctrl.Result{}, r.updateStatus(ctx, initialStatus, instance, status.ConditionDegraded, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message) } if annotations.IsPauseReconciliationEnabled(instance.Annotations) { @@ -109,7 +115,7 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct } result, condition, err := r.reconcileResource(ctx, instance) - if err := r.updateStatus(ctx, instance, condition, status.ReasonFromError(err), status.MessageFromError(err)); err != nil { + if err := r.updateStatus(ctx, initialStatus, instance, condition, status.ReasonFromError(err), status.MessageFromError(err)); err != nil { klog.InfoS("Failed to update numaresourcesscheduler status", "Desired condition", condition, "error", err) } @@ -184,6 +190,8 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex klog.V(4).Info("SchedulerSync start") defer klog.V(4).Info("SchedulerSync stop") + platformNormalize(&instance.Spec, r.PlatformInfo) + schedSpec := instance.Spec.Normalize() cacheResyncPeriod := unpackAPIResyncPeriod(schedSpec.CacheResyncPeriod) params := configParamsFromSchedSpec(schedSpec, cacheResyncPeriod, r.Namespace) @@ -200,13 +208,15 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex return nropv1.NUMAResourcesSchedulerStatus{}, err } - schedStatus := nropv1.NUMAResourcesSchedulerStatus{ - SchedulerName: schedSpec.SchedulerName, - CacheResyncPeriod: &metav1.Duration{ - Duration: cacheResyncPeriod, - }, + schedStatus := *instance.Status.DeepCopy() + schedStatus.SchedulerName = schedSpec.SchedulerName + schedStatus.CacheResyncPeriod = &metav1.Duration{ + Duration: cacheResyncPeriod, } + informerCondition := buildDedicatedInformerCondition(*instance, schedSpec) + schedStatus.Conditions = status.GetUpdatedSchedulerConditions(schedStatus.Conditions, informerCondition) + r.SchedulerManifests.Deployment.Spec.Replicas = r.computeSchedulerReplicas(schedSpec) klog.V(4).InfoS("using scheduler replicas", "replicas", *r.SchedulerManifests.Deployment.Spec.Replicas) // TODO: if replicas doesn't make sense (autodetect disabled and user set impossible value) then we @@ -251,8 +261,55 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex return schedStatus, nil } -func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, sched *nropv1.NUMAResourcesScheduler, condition string, reason string, message string) error { - sched.Status.Conditions, _ = status.UpdateConditions(sched.Status.Conditions, condition, reason, message) +func platformNormalize(spec *nropv1.NUMAResourcesSchedulerSpec, platInfo platforminfo.PlatformInfo) { + if platInfo.Platform != platform.OpenShift && platInfo.Platform != platform.HyperShift { + return + } + if spec.SchedulerInformer != nil { + // assume user-provided value. Nothing to do. + klog.V(4).InfoS("SchedulerInformer explicit value", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", *spec.SchedulerInformer) + return + } + if !platInfo.Properties.PodResourcesListFilterActivePods { + // keep shared default for backward compatibility. TODO: review/switch default in 4.21 + return + } + spec.SchedulerInformer = ptr.To(nropv1.SchedulerInformerShared) + klog.V(4).InfoS("SchedulerInformer default is overridden", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", *spec.SchedulerInformer) +} + +func buildDedicatedInformerCondition(instance nropv1.NUMAResourcesScheduler, normalized nropv1.NUMAResourcesSchedulerSpec) metav1.Condition { + condition := metav1.Condition{ + Type: status.ConditionDedicatedInformerActive, + Status: metav1.ConditionTrue, + ObservedGeneration: instance.ObjectMeta.Generation, + Reason: status.ConditionDedicatedInformerActive, + } + + if *normalized.SchedulerInformer == nropv1.SchedulerInformerShared { + condition.Status = metav1.ConditionFalse + } + + return condition +} + +func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, sched *nropv1.NUMAResourcesScheduler, condition string, reason string, message string) error { + c := metav1.Condition{ + Type: condition, + Status: metav1.ConditionTrue, + Reason: reason, + Message: message, + } + sched.Status.Conditions = status.GetUpdatedSchedulerConditions(sched.Status.Conditions, c) + + if !status.NUMAResourcesSchedulerNeedsUpdate(initialStatus, sched.Status) { + return nil + } + + if status.EqualConditions(initialStatus.Conditions, sched.Status.Conditions) { + sched.Status.Conditions = initialStatus.Conditions + } + if err := r.Client.Status().Update(ctx, sched); err != nil { return fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(sched), err) } diff --git a/internal/controller/numaresourcesscheduler_controller_test.go b/internal/controller/numaresourcesscheduler_controller_test.go index b68adb3aa..844ccff62 100644 --- a/internal/controller/numaresourcesscheduler_controller_test.go +++ b/internal/controller/numaresourcesscheduler_controller_test.go @@ -38,15 +38,18 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/scheme" "k8s.io/klog/v2" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform" depmanifests "github.com/k8stopologyawareschedwg/deployer/pkg/manifests" depobjupdate "github.com/k8stopologyawareschedwg/deployer/pkg/objectupdate" nropv1 "github.com/openshift-kni/numaresources-operator/api/v1" "github.com/openshift-kni/numaresources-operator/internal/api/annotations" testobjs "github.com/openshift-kni/numaresources-operator/internal/objects" + "github.com/openshift-kni/numaresources-operator/internal/platforminfo" "github.com/openshift-kni/numaresources-operator/pkg/hash" nrosched "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler" schedmanifests "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler/manifests/sched" @@ -214,6 +217,67 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() { gomega.Expect(nrs.Status.CacheResyncPeriod.Seconds()).To(gomega.Equal(resyncPeriod.Seconds())) }) + ginkgo.Context("should reflect DedicatedInformerActive in status conditions", func() { + ginkgo.It("with default values", func() { + key := client.ObjectKeyFromObject(nrs) + _, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + gomega.Expect(reconciler.Client.Get(context.TODO(), key, nrs)).To(gomega.Succeed()) + + c := getConditionByType(nrs.Status.Conditions, status.ConditionDedicatedInformerActive) + + gomega.Expect(c).ToNot(gomega.BeNil()) + gomega.Expect(c.Status).To(gomega.Equal(metav1.ConditionTrue)) + }) + + ginkgo.It("with updated values - explicitly configured to Dedicated", func() { + nrs := nrs.DeepCopy() + nrs.Spec.SchedulerInformer = ptr.To(nropv1.SchedulerInformerDedicated) + + gomega.Eventually(func() bool { + if err := reconciler.Client.Update(context.TODO(), nrs); err != nil { + klog.Warningf("failed to update the scheduler object; err: %v", err) + return false + } + return true + }, 30*time.Second, 5*time.Second).Should(gomega.BeTrue()) + + key := client.ObjectKeyFromObject(nrs) + _, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + gomega.Expect(reconciler.Client.Get(context.TODO(), key, nrs)).To(gomega.Succeed()) + + c := getConditionByType(nrs.Status.Conditions, status.ConditionDedicatedInformerActive) + + gomega.Expect(c).ToNot(gomega.BeNil()) + gomega.Expect(c.Status).To(gomega.Equal(metav1.ConditionTrue)) + }) + + ginkgo.It("with updated values - explicitly configured to Shared", func() { + nrs := nrs.DeepCopy() + nrs.Spec.SchedulerInformer = ptr.To(nropv1.SchedulerInformerShared) + + gomega.Eventually(func() bool { + if err := reconciler.Client.Update(context.TODO(), nrs); err != nil { + klog.Warningf("failed to update the scheduler object; err: %v", err) + return false + } + return true + }, 30*time.Second, 5*time.Second).Should(gomega.BeTrue()) + + key := client.ObjectKeyFromObject(nrs) + _, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + gomega.Expect(reconciler.Client.Get(context.TODO(), key, nrs)).To(gomega.Succeed()) + + c := getConditionByType(nrs.Status.Conditions, status.ConditionDedicatedInformerActive) + + gomega.Expect(c).ToNot(gomega.BeNil()) + gomega.Expect(c.Status).To(gomega.Equal(metav1.ConditionFalse)) + }) + }) + ginkgo.It("should have the correct priority class", func() { key := client.ObjectKeyFromObject(nrs) _, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key}) @@ -697,6 +761,128 @@ var _ = ginkgo.Describe("Test NUMAResourcesScheduler Reconcile", func() { gomega.Expect(dp).To(HaveTheSameResourceRequirements(expectedRR)) }) }) + + ginkgo.Context("with kubelet PodResourcesAPI listing active pods by default", func() { + var nrs *nropv1.NUMAResourcesScheduler + var reconciler *NUMAResourcesSchedulerReconciler + numOfMasters := 3 + + ginkgo.When("kubelet fix is enabled", func() { + fixedVersion, _ := platform.ParseVersion("4.19.10") + unfixedVersion, _ := platform.ParseVersion("4.19.0") // can't (and we must not even if we can) rewrite history + futureFixedVersionZstream, _ := platform.ParseVersion("4.19.20") // we must never regress + futureFixedVersion, _ := platform.ParseVersion("4.21.0") // we must never regress + + ginkgo.DescribeTable("should configure by default the informerMode to the expected when field is not set", func(reconcilerPlatInfo platforminfo.PlatformInfo, expectedInformer string) { + var err error + nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second) + initObjects := []runtime.Object{nrs} + initObjects = append(initObjects, fakeNodes(numOfMasters, 3)...) + reconciler, err = NewFakeNUMAResourcesSchedulerReconciler(initObjects...) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + reconciler.PlatformInfo = reconcilerPlatInfo + + key := client.ObjectKeyFromObject(nrs) + _, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, expectedInformer) + + expectedDedicatedActiveStatus := metav1.ConditionTrue + if expectedInformer == depmanifests.CacheInformerShared { + expectedDedicatedActiveStatus = metav1.ConditionFalse + } + + gomega.Expect(reconciler.Client.Get(context.TODO(), key, nrs)).To(gomega.Succeed()) + c := getConditionByType(nrs.Status.Conditions, status.ConditionDedicatedInformerActive) + + gomega.Expect(c).ToNot(gomega.BeNil()) + gomega.Expect(c.Status).To(gomega.Equal(expectedDedicatedActiveStatus)) + }, + ginkgo.Entry("with fixed Openshift the default informer is Shared", platforminfo.New(platform.OpenShift, fixedVersion), depmanifests.CacheInformerShared), + ginkgo.Entry("with fixed Hypershift the default informer is Shared", platforminfo.New(platform.HyperShift, fixedVersion), depmanifests.CacheInformerShared), + ginkgo.Entry("with unfixed platform the default informer is Dedicated (unchanged)", platforminfo.New(platform.OpenShift, unfixedVersion), depmanifests.CacheInformerDedicated), + ginkgo.Entry("with unfixed platform the default informer is Dedicated (unchanged)", platforminfo.New(platform.HyperShift, unfixedVersion), depmanifests.CacheInformerDedicated), + ginkgo.Entry("with fixed Openshift the default informer is Shared", platforminfo.New(platform.OpenShift, futureFixedVersion), depmanifests.CacheInformerShared), + ginkgo.Entry("with fixed Hypershift the default informer is Shared", platforminfo.New(platform.HyperShift, futureFixedVersion), depmanifests.CacheInformerShared), + ginkgo.Entry("with fixed Openshift the default informer is Shared", platforminfo.New(platform.OpenShift, futureFixedVersionZstream), depmanifests.CacheInformerShared), + ginkgo.Entry("with fixed Hypershift the default informer is Shared", platforminfo.New(platform.HyperShift, futureFixedVersionZstream), depmanifests.CacheInformerShared), + ginkgo.Entry("with unknown platform the default informer is Dedicated (unchanged)", platforminfo.PlatformInfo{}, depmanifests.CacheInformerDedicated), + ) + + ginkgo.DescribeTable("should preserve informerMode value if set", func(reconcilerPlatInfo platforminfo.PlatformInfo) { + var err error + nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second) + infMode := nropv1.SchedulerInformerDedicated + nrs.Spec.SchedulerInformer = &infMode + initObjects := []runtime.Object{nrs} + initObjects = append(initObjects, fakeNodes(numOfMasters, 3)...) + reconciler, err = NewFakeNUMAResourcesSchedulerReconciler(initObjects...) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + reconciler.PlatformInfo = reconcilerPlatInfo + + key := client.ObjectKeyFromObject(nrs) + _, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, string(infMode)) + }, + ginkgo.Entry("with Openshift", platforminfo.PlatformInfo{ + Platform: platform.OpenShift, + Version: fixedVersion, + }), + ginkgo.Entry("with Hypershift", platforminfo.PlatformInfo{ + Platform: platform.HyperShift, + Version: fixedVersion, + }), + ginkgo.Entry("with unknown platform", platforminfo.PlatformInfo{})) + + ginkgo.DescribeTable("should allow to update the informerMode to be Dedicated after an overridden default", func(reconcilerPlatInfo platforminfo.PlatformInfo) { + var err error + nrs = testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second) + initObjects := []runtime.Object{nrs} + initObjects = append(initObjects, fakeNodes(numOfMasters, 3)...) + reconciler, err = NewFakeNUMAResourcesSchedulerReconciler(initObjects...) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + reconciler.PlatformInfo = reconcilerPlatInfo + + key := client.ObjectKeyFromObject(nrs) + _, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + // intentionally skip checking default value + + // should query the object after reconcile because the defaults are overridden + gomega.Expect(reconciler.Client.Get(context.TODO(), key, nrs)).ToNot(gomega.HaveOccurred()) + + nrsUpdated := nrs.DeepCopy() + informerMode := nropv1.SchedulerInformerDedicated + nrsUpdated.Spec.SchedulerInformer = &informerMode + gomega.Eventually(func() bool { + if err := reconciler.Client.Update(context.TODO(), nrsUpdated); err != nil { + klog.Warningf("failed to update the scheduler object; err: %v", err) + return false + } + return true + }, 30*time.Second, 5*time.Second).Should(gomega.BeTrue()) + + _, err = reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: key}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + expectCacheParams(reconciler.Client, depmanifests.CacheResyncAutodetect, depmanifests.CacheResyncOnlyExclusiveResources, string(informerMode)) + }, + ginkgo.Entry("with Openshift", platforminfo.PlatformInfo{ + Platform: platform.OpenShift, + Version: fixedVersion, + }), + ginkgo.Entry("with Hypershift", platforminfo.PlatformInfo{ + Platform: platform.HyperShift, + Version: fixedVersion, + })) + }) + }) }) func HaveTheSameResourceRequirements(expectedRR corev1.ResourceRequirements) types.GomegaMatcher { diff --git a/internal/platforminfo/platforminfo.go b/internal/platforminfo/platforminfo.go new file mode 100644 index 000000000..d02da92cd --- /dev/null +++ b/internal/platforminfo/platforminfo.go @@ -0,0 +1,46 @@ +/* + * Copyright 2025 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package platforminfo + +import ( + "github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform" +) + +// PlatformProperties represents the platform capabilities we have to infer and which are not explicitly +// advertised from the platform using standard APIs. +type PlatformProperties struct { + PodResourcesListFilterActivePods bool +} + +type PlatformInfo struct { + Platform platform.Platform + Version platform.Version + Properties PlatformProperties +} + +func New(plat platform.Platform, ver platform.Version) PlatformInfo { + info := PlatformInfo{ + Platform: plat, + Version: ver, + } + discoverProperties(&info) + return info +} + +func discoverProperties(info *PlatformInfo) { + info.Properties.PodResourcesListFilterActivePods = isVersionEnoughForPodresourcesListFilterActivePods(info.Platform, info.Version) +} diff --git a/internal/platforminfo/platforminfo_test.go b/internal/platforminfo/platforminfo_test.go new file mode 100644 index 000000000..a33c048b7 --- /dev/null +++ b/internal/platforminfo/platforminfo_test.go @@ -0,0 +1,85 @@ +/* + * Copyright 2025 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package platforminfo + +import ( + "reflect" + "testing" + + "github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform" +) + +func TestNewWithDiscover(t *testing.T) { + testcases := []struct { + description string + plat platform.Platform + ver platform.Version + expected PlatformProperties + }{ + { + description: "empty", + }, + { + description: "last major unfixed", + plat: platform.OpenShift, + ver: mustParseVersion("4.19.0"), + expected: PlatformProperties{ + PodResourcesListFilterActivePods: false, + }, + }, + { + description: "first major fixed", + plat: platform.OpenShift, + ver: mustParseVersion("4.20.0"), // at time of writing + expected: PlatformProperties{ + PodResourcesListFilterActivePods: true, + }, + }, + { + description: "next Z-stream fixed, must never regress", // at time of writing + plat: platform.OpenShift, + ver: mustParseVersion("4.20.1"), + expected: PlatformProperties{ + PodResourcesListFilterActivePods: true, + }, + }, + { + description: "next major fixed, must never regress", // at time of writing + plat: platform.OpenShift, + ver: mustParseVersion("4.21.0"), + expected: PlatformProperties{ + PodResourcesListFilterActivePods: true, + }, + }, + } + for _, tc := range testcases { + t.Run(tc.description, func(t *testing.T) { + got := New(tc.plat, tc.ver) + if !reflect.DeepEqual(got.Properties, tc.expected) { + t.Errorf("expected %#v got %#v", tc.expected, got.Properties) + } + }) + } +} + +func mustParseVersion(ver string) platform.Version { + v, err := platform.ParseVersion(ver) + if err != nil { + panic(err) + } + return v +} diff --git a/internal/platforminfo/supportversion.go b/internal/platforminfo/supportversion.go new file mode 100644 index 000000000..36c777bb2 --- /dev/null +++ b/internal/platforminfo/supportversion.go @@ -0,0 +1,82 @@ +/* + * Copyright 2025 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package platforminfo + +import ( + "strings" + + "k8s.io/klog/v2" + + "github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform" +) + +// This file processes the known OCP platform versions, so far nightly, konflux-nightly, CI, dev-preview, and RC + +// supported version for the podresources List API fix. +// see: https://issues.redhat.com/browse/OCPBUGS-56785 +// see: https://github.com/kubernetes/kubernetes/pull/132028 +const ( + // first OCP build to have the fix is always the nightly build. Yet that doesn't necessarily mean that builds of other lanes + // (e.i CI) that produce the build after the specific nightly build date will certainly + // have the fix, thus we need to track them separately. + // releases that are already have thier first build GA do not have dev-preview or release-candidate builds + StableSupportSince = "4.19.10" + NightlySupportSince = "4.19.0-0.nightly-2025-08-14-135013" + CISupportSince = "4.19.0-0.ci-2025-09-01-150951" // *** the earliest that was found available at time of writing this +) + +func decodeMinimumVersion(version platform.Version) string { + v := version.String() + if strings.Contains(v, ".nightly-") { + return NightlySupportSince + } + if strings.Contains(v, ".ci-") { + return CISupportSince + } + if !strings.Contains(v, "-") { + return StableSupportSince + } + return "" +} + +func isVersionEnoughForPodresourcesListFilterActivePods(platf platform.Platform, version platform.Version) bool { + if platf != platform.OpenShift && platf != platform.HyperShift { + return false + } + + minVer := decodeMinimumVersion(version) + if minVer == "" { + // should never happen, so we are loud about it + klog.Infof("unrecognized version %q", version) + return false + } + + parsedVersion, err := platform.ParseVersion(minVer) + if err != nil { + // should never happen, so we are loud about it + klog.Infof("failed to parse version %q: %v", minVer, err) + return false + } + + ok, err := version.AtLeast(parsedVersion) + if err != nil { + klog.Infof("failed to compare version %v with %v, err %v", parsedVersion, version, err) + return false + } + + return ok +} diff --git a/internal/platforminfo/supportversion_test.go b/internal/platforminfo/supportversion_test.go new file mode 100644 index 000000000..10e7e7554 --- /dev/null +++ b/internal/platforminfo/supportversion_test.go @@ -0,0 +1,155 @@ +/* + * Copyright 2025 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package platforminfo + +import ( + "testing" + + "github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform" +) + +func TestDecodeMinimumVersion(t *testing.T) { + nightly, _ := platform.ParseVersion("4.19.0-0.nightly-2025-08-04-12") + ci, _ := platform.ParseVersion("4.19.0-0.ci-2025-08-04-12") + stable, _ := platform.ParseVersion("4.19.0") + unrecognized, _ := platform.ParseVersion("4.19.0-unknown") + + tests := []struct { + name string + version platform.Version + leastSupportExpected string + shouldBeUnrecognized bool + }{ + { + name: "nightly", + version: nightly, + leastSupportExpected: NightlySupportSince, + }, + { + name: "stable", + version: stable, + leastSupportExpected: StableSupportSince, + }, + { + name: "ci", + version: ci, + leastSupportExpected: CISupportSince, + }, + { + name: "unrecognized", + version: unrecognized, + leastSupportExpected: "", + shouldBeUnrecognized: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := decodeMinimumVersion(tt.version) + if tt.shouldBeUnrecognized { + if got != "" { + t.Fatalf("expected unrecognized, but got %q", tt.version) + } + return + } + + if !tt.shouldBeUnrecognized && got == "" { + t.Fatalf("unexpected unrecognized version: got %v from %v", got, tt.version) + } + + if got != tt.leastSupportExpected { + t.Errorf("DecodeMinimumVersion() got = %v, want %v", got, tt.leastSupportExpected) + } + }) + } +} + +func TestIsVersionEnoughForPodresourcesListFilterActivePods(t *testing.T) { + nightlyGreater, _ := platform.ParseVersion("4.19.0-0.nightly-2025-08-24-154810") + ciGreater, _ := platform.ParseVersion("4.19.0-0.ci-2025-10-00-000000") + stableGreater, _ := platform.ParseVersion("4.19.11") + + unsupportedNightly, _ := platform.ParseVersion("4.19.0-0.nightly-2024-08-04-150000") + unsupportedStable, _ := platform.ParseVersion("4.19.0") + + tests := []struct { + name string + platf platform.Platform + version platform.Version + want bool + }{ + { + name: "empty", + want: false, + }, + { + name: "nightly - least supported", + platf: platform.OpenShift, + version: NightlySupportSince, + want: true, + }, + { + name: "nightly - greater than least supported", + platf: platform.OpenShift, + version: nightlyGreater, + want: true, + }, + { + name: "nightly - unsupported", + platf: platform.OpenShift, + version: unsupportedNightly, + want: false, + }, + { + name: "stable - least supported", + platf: platform.OpenShift, + version: StableSupportSince, + want: true, + }, + { + name: "stable - greater than least supported", + platf: platform.OpenShift, + version: stableGreater, + want: true, + }, + { + name: "stable - unsupported", + platf: platform.OpenShift, + version: unsupportedStable, + want: false, + }, + { + name: "CI - least supported", + platf: platform.OpenShift, + version: CISupportSince, + want: true, + }, + { + name: "CI - greater than least supported", + platf: platform.OpenShift, + version: ciGreater, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isVersionEnoughForPodresourcesListFilterActivePods(tt.platf, tt.version); got != tt.want { + t.Errorf("isVersionEnoughForPodresourcesListFilterActivePods() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/status/status.go b/pkg/status/status.go index b350179c1..5d5c73e84 100644 --- a/pkg/status/status.go +++ b/pkg/status/status.go @@ -34,6 +34,10 @@ const ( ConditionUpgradeable = "Upgradeable" ) +const ( + ConditionDedicatedInformerActive = "DedicatedInformerActive" +) + // TODO: are we duping these? const ( ReasonAsExpected = "AsExpected" @@ -58,14 +62,34 @@ func IsUpdatedNUMAResourcesOperator(oldStatus, newStatus *nropv1.NUMAResourcesOp return !reflect.DeepEqual(os, ns) } +func NUMAResourcesSchedulerNeedsUpdate(oldStatus, newStatus nropv1.NUMAResourcesSchedulerStatus) bool { + os := oldStatus.DeepCopy() + ns := newStatus.DeepCopy() + + resetIncomparableConditionFields(os.Conditions) + resetIncomparableConditionFields(ns.Conditions) + + return !reflect.DeepEqual(os, ns) +} + +func EqualConditions(current, updated []metav1.Condition) bool { + c := CloneConditions(current) + u := CloneConditions(updated) + + resetIncomparableConditionFields(c) + resetIncomparableConditionFields(u) + + return reflect.DeepEqual(c, u) +} + // UpdateConditions compute new conditions based on arguments, and then compare with given current conditions. // Returns the conditions to use, either current or newly computed, and a boolean flag which is `true` if conditions need // update - so if they are updated since the current conditions. func UpdateConditions(currentConditions []metav1.Condition, condition string, reason string, message string) ([]metav1.Condition, bool) { conditions := NewConditions(condition, reason, message) - cond := clone(conditions) - curCond := clone(currentConditions) + cond := CloneConditions(conditions) + curCond := CloneConditions(currentConditions) resetIncomparableConditionFields(cond) resetIncomparableConditionFields(curCond) @@ -134,6 +158,42 @@ func newBaseConditions() []metav1.Condition { } } +func NewNUMAResourcesSchedulerBaseConditions() []metav1.Condition { + now := time.Now() + return []metav1.Condition{ + { + Type: ConditionAvailable, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.Time{Time: now}, + Reason: ConditionAvailable, + }, + { + Type: ConditionUpgradeable, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.Time{Time: now}, + Reason: ConditionUpgradeable, + }, + { + Type: ConditionProgressing, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.Time{Time: now}, + Reason: ConditionProgressing, + }, + { + Type: ConditionDegraded, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.Time{Time: now}, + Reason: ConditionDegraded, + }, + { + Type: ConditionDedicatedInformerActive, + Status: metav1.ConditionUnknown, + LastTransitionTime: metav1.Time{Time: now}, + Reason: ConditionDedicatedInformerActive, + }, + } +} + func ReasonFromError(err error) string { if err == nil { return ReasonAsExpected @@ -159,8 +219,37 @@ func resetIncomparableConditionFields(conditions []metav1.Condition) { } } -func clone(conditions []metav1.Condition) []metav1.Condition { +func CloneConditions(conditions []metav1.Condition) []metav1.Condition { var c = make([]metav1.Condition, len(conditions)) copy(c, conditions) return c } + +func GetUpdatedSchedulerConditions(conditions []metav1.Condition, condition metav1.Condition) []metav1.Condition { + updatedConditions := NewNUMAResourcesSchedulerBaseConditions() + if len(conditions) != 0 { + updatedConditions = CloneConditions(conditions) + } + + now := time.Now() + for idx, cond := range updatedConditions { + if cond.Type == condition.Type { + updatedConditions[idx] = condition + updatedConditions[idx].LastTransitionTime = metav1.Time{Time: now} + break + } + } + + if condition.Type == ConditionAvailable { + for idx, cond := range updatedConditions { + if cond.Type == ConditionUpgradeable { + updatedConditions[idx].Status = condition.Status + updatedConditions[idx].LastTransitionTime = metav1.Time{Time: now} + + break + } + } + } + + return updatedConditions +} diff --git a/pkg/status/status_test.go b/pkg/status/status_test.go index b8edc38f8..2932cfe0e 100644 --- a/pkg/status/status_test.go +++ b/pkg/status/status_test.go @@ -20,11 +20,14 @@ import ( "context" "errors" "fmt" + "reflect" "testing" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -191,6 +194,58 @@ func TestIsUpdatedNUMAResourcesOperator(t *testing.T) { } } +func TestNUMAResourcesSchedulerNeedsUpdate(t *testing.T) { + type testCase struct { + name string + oldStatus nropv1.NUMAResourcesSchedulerStatus + updaterFunc func(*nropv1.NUMAResourcesSchedulerStatus) + expectedUpdated bool + } + testCases := []testCase{ + { + name: "empty status, no change", + oldStatus: nropv1.NUMAResourcesSchedulerStatus{}, + updaterFunc: func(st *nropv1.NUMAResourcesSchedulerStatus) {}, + expectedUpdated: false, + }, + { + name: "status, conditions, updated only time", + oldStatus: nropv1.NUMAResourcesSchedulerStatus{ + Conditions: NewConditions(ConditionAvailable, "test all good", "testing info"), + }, + updaterFunc: func(st *nropv1.NUMAResourcesSchedulerStatus) { + time.Sleep(42 * time.Millisecond) // make sure the timestamp changed + st.Conditions = NewConditions(ConditionAvailable, "test all good", "testing info") + }, + expectedUpdated: false, + }, + { + name: "status, conditions, updated only time, other fields changed", + oldStatus: nropv1.NUMAResourcesSchedulerStatus{ + Conditions: NewConditions(ConditionAvailable, "test all good", "testing info"), + }, + updaterFunc: func(st *nropv1.NUMAResourcesSchedulerStatus) { + time.Sleep(42 * time.Millisecond) // make sure the timestamp changed + st.Conditions = NewConditions(ConditionAvailable, "test all good", "testing info") + st.CacheResyncPeriod = ptr.To(metav1.Duration{Duration: 42 * time.Second}) + }, + expectedUpdated: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + oldStatus := *tc.oldStatus.DeepCopy() + newStatus := *tc.oldStatus.DeepCopy() + tc.updaterFunc(&newStatus) + got := NUMAResourcesSchedulerNeedsUpdate(oldStatus, newStatus) + if got != tc.expectedUpdated { + t.Errorf("isUpdated %v expected %v", got, tc.expectedUpdated) + } + }) + } +} + func TestReasonFromError(t *testing.T) { type testCase struct { name string @@ -270,3 +325,164 @@ func TestMessageFromError(t *testing.T) { }) } } + +func TestGetUpdatedSchedulerConditions(t *testing.T) { + tests := []struct { + name string + conditions []metav1.Condition + condition metav1.Condition + expected []metav1.Condition + }{ + { + name: "first reconcile iteration - with operator condition", + condition: metav1.Condition{ + Type: ConditionAvailable, + Status: metav1.ConditionTrue, + Reason: ConditionAvailable, + Message: "test", + }, + expected: []metav1.Condition{ + { + Type: ConditionAvailable, + Status: metav1.ConditionTrue, + Reason: ConditionAvailable, + Message: "test", + }, + { + Type: ConditionUpgradeable, + Status: metav1.ConditionTrue, + Reason: ConditionUpgradeable, + }, + { + Type: ConditionProgressing, + Status: metav1.ConditionFalse, + Reason: ConditionProgressing, + }, + { + Type: ConditionDegraded, + Status: metav1.ConditionFalse, + Reason: ConditionDegraded, + }, + { + Type: ConditionDedicatedInformerActive, + Status: metav1.ConditionUnknown, + Reason: ConditionDedicatedInformerActive, + }, + }, + }, + { + name: "first reconcile iteration - with informer condition", + condition: metav1.Condition{ + Type: ConditionDedicatedInformerActive, + Status: metav1.ConditionTrue, + Reason: ConditionDedicatedInformerActive, + Message: "test", + }, + expected: []metav1.Condition{ + { + Type: ConditionAvailable, + Status: metav1.ConditionFalse, + Reason: ConditionAvailable, + }, + { + Type: ConditionUpgradeable, + Status: metav1.ConditionFalse, + Reason: ConditionUpgradeable, + }, + { + Type: ConditionProgressing, + Status: metav1.ConditionFalse, + Reason: ConditionProgressing, + }, + { + Type: ConditionDegraded, + Status: metav1.ConditionFalse, + Reason: ConditionDegraded, + }, + { + Type: ConditionDedicatedInformerActive, + Status: metav1.ConditionTrue, + Reason: ConditionDedicatedInformerActive, + Message: "test", + }, + }, + }, + { + name: "non-empty with informer condition", + conditions: []metav1.Condition{ + { + Type: ConditionAvailable, + Status: metav1.ConditionTrue, + Reason: ConditionAvailable, + }, + { + Type: ConditionUpgradeable, + Status: metav1.ConditionTrue, + Reason: ConditionUpgradeable, + }, + { + Type: ConditionProgressing, + Status: metav1.ConditionTrue, + Reason: ConditionProgressing, + }, + { + Type: ConditionDegraded, + Status: metav1.ConditionTrue, + Reason: ConditionDegraded, + }, + { + Type: ConditionDedicatedInformerActive, + Status: metav1.ConditionTrue, + Reason: ConditionDedicatedInformerActive, + Message: "test", + }, + }, + condition: metav1.Condition{ + Type: ConditionDedicatedInformerActive, + Status: metav1.ConditionFalse, + Reason: ConditionDedicatedInformerActive, + Message: "test3", + }, + expected: []metav1.Condition{ + { + Type: ConditionAvailable, + Status: metav1.ConditionTrue, + Reason: ConditionAvailable, + }, + { + Type: ConditionUpgradeable, + Status: metav1.ConditionTrue, + Reason: ConditionUpgradeable, + }, + { + Type: ConditionProgressing, + Status: metav1.ConditionTrue, + Reason: ConditionProgressing, + }, + { + Type: ConditionDegraded, + Status: metav1.ConditionTrue, + Reason: ConditionDegraded, + }, + { + Type: ConditionDedicatedInformerActive, + Status: metav1.ConditionFalse, + Reason: ConditionDedicatedInformerActive, + Message: "test3", + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := GetUpdatedSchedulerConditions(tt.conditions, tt.condition) + + resetIncomparableConditionFields(got) + resetIncomparableConditionFields(tt.expected) + + if !reflect.DeepEqual(got, tt.expected) { + t.Errorf("mismatching conditions got\n%v\nexpected\n%v\n", got, tt.expected) + } + }) + } +}