diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml index db6b834e37f..409c70b9bbd 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml @@ -11146,6 +11146,9 @@ spec: each node group. format: int32 type: integer + reason: + description: Reason provides more information about current State + type: string state: description: 'INSERT ADDITIONAL STATUS FIELD - define observed state of cluster Important: Run "make" to regenerat' diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index ddd9dfea777..d96c8b366ca 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -11704,6 +11704,9 @@ spec: of each node group. format: int32 type: integer + reason: + description: Reason provides more information about current State + type: string state: description: 'INSERT ADDITIONAL STATUS FIELD - define observed state of cluster Important: Run "make" to regenerat' diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index db8f28c4892..2fc6132f29d 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -11760,6 +11760,10 @@ spec: of each node group. format: int32 type: integer + reason: + description: Reason provides more information about current + State + type: string state: description: 'INSERT ADDITIONAL STATUS FIELD - define observed state of cluster Important: Run "make" to regenerat' @@ -11867,6 +11871,10 @@ spec: of each node group. 
format: int32 type: integer + reason: + description: Reason provides more information about current + State + type: string state: description: 'INSERT ADDITIONAL STATUS FIELD - define observed state of cluster Important: Run "make" to regenerat' diff --git a/ray-operator/apis/ray/v1alpha1/raycluster_types.go b/ray-operator/apis/ray/v1alpha1/raycluster_types.go index 612bd525660..befb12d98fc 100644 --- a/ray-operator/apis/ray/v1alpha1/raycluster_types.go +++ b/ray-operator/apis/ray/v1alpha1/raycluster_types.go @@ -119,6 +119,8 @@ type RayClusterStatus struct { Endpoints map[string]string `json:"endpoints,omitempty"` // Head info Head HeadInfo `json:"head,omitempty"` + // Reason provides more information about current State + Reason string `json:"reason,omitempty"` } // HeadInfo gives info about head @@ -167,5 +169,6 @@ func init() { type EventReason string const ( - RayConfigError EventReason = "RayConfigError" + RayConfigError EventReason = "RayConfigError" + PodReconciliationError EventReason = "PodReconciliationError" ) diff --git a/ray-operator/config/crd/bases/ray.io_rayclusters.yaml b/ray-operator/config/crd/bases/ray.io_rayclusters.yaml index db6b834e37f..409c70b9bbd 100644 --- a/ray-operator/config/crd/bases/ray.io_rayclusters.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayclusters.yaml @@ -11146,6 +11146,9 @@ spec: each node group. format: int32 type: integer + reason: + description: Reason provides more information about current State + type: string state: description: 'INSERT ADDITIONAL STATUS FIELD - define observed state of cluster Important: Run "make" to regenerat' diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index ddd9dfea777..d96c8b366ca 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -11704,6 +11704,9 @@ spec: of each node group. 
format: int32 type: integer + reason: + description: Reason provides more information about current State + type: string state: description: 'INSERT ADDITIONAL STATUS FIELD - define observed state of cluster Important: Run "make" to regenerat' diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index db8f28c4892..2fc6132f29d 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -11760,6 +11760,10 @@ spec: of each node group. format: int32 type: integer + reason: + description: Reason provides more information about current + State + type: string state: description: 'INSERT ADDITIONAL STATUS FIELD - define observed state of cluster Important: Run "make" to regenerat' @@ -11867,6 +11871,10 @@ spec: of each node group. format: int32 type: integer + reason: + description: Reason provides more information about current + State + type: string state: description: 'INSERT ADDITIONAL STATUS FIELD - define observed state of cluster Important: Run "make" to regenerat' diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 03995916323..6c62e3fd91e 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -31,6 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" ) @@ -214,6 +215,10 @@ func (r *RayClusterReconciler) rayClusterReconcile(request ctrl.Request, instanc if updateErr := r.updateClusterState(instance, rayiov1alpha1.Failed); updateErr != nil { r.Log.Error(updateErr, "RayCluster update state error", "cluster name", request.Name) 
}
+		if updateErr := r.updateClusterReason(instance, err.Error()); updateErr != nil {
+			r.Log.Error(updateErr, "RayCluster update reason error", "cluster name", request.Name)
+		}
+		r.Recorder.Event(instance, corev1.EventTypeWarning, string(rayiov1alpha1.PodReconciliationError), err.Error())
 		return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err
 	}
 	// update the status if needed
@@ -762,6 +767,10 @@ func (r *RayClusterReconciler) buildWorkerPod(instance rayiov1alpha1.RayCluster,
 func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager, reconcileConcurrency int) error {
 	return ctrl.NewControllerManagedBy(mgr).
 		For(&rayiov1alpha1.RayCluster{}).Named("raycluster-controller").
+		// NOTE(review): WithEventFilter installs these predicates for every watch on
+		// this builder, i.e. also the Event and Pod watches below, not only RayCluster.
+		// Confirm that filtering Pod/Event updates this way is intended.
+		WithEventFilter(predicate.Or(predicate.GenerationChangedPredicate{}, predicate.LabelChangedPredicate{}, predicate.AnnotationChangedPredicate{})).
 		Watches(&source.Kind{Type: &corev1.Event{}}, &handler.EnqueueRequestForObject{}).
 		Watches(&source.Kind{Type: &corev1.Pod{}}, &handler.EnqueueRequestForOwner{
 			IsController: true,
@@ -1049,3 +1058,10 @@ func (r *RayClusterReconciler) updateClusterState(instance *rayiov1alpha1.RayClu
 	instance.Status.State = clusterState
 	return r.Status().Update(context.Background(), instance)
 }
+
+// updateClusterReason stores clusterReason in instance.Status.Reason and writes
+// the status through the reconciler's status client, returning any update error.
+func (r *RayClusterReconciler) updateClusterReason(instance *rayiov1alpha1.RayCluster, clusterReason string) error {
+	instance.Status.Reason = clusterReason
+	return r.Status().Update(context.Background(), instance)
+}
diff --git a/ray-operator/controllers/ray/raycluster_controller_fake_test.go b/ray-operator/controllers/ray/raycluster_controller_fake_test.go
index 882ac5462a2..f1b9e21b01e 100644
--- a/ray-operator/controllers/ray/raycluster_controller_fake_test.go
+++ b/ray-operator/controllers/ray/raycluster_controller_fake_test.go
@@ -774,6 +774,41 @@ func TestReconcile_AutoscalerRoleBinding(t *testing.T) {
 	assert.Nil(t, err, "Fail to get autoscaler RoleBinding after reconciliation")
 }
 
+// TestReconcile_UpdateClusterReason checks that updateClusterReason persists the
+// given reason into the RayCluster status held by the fake client.
+func TestReconcile_UpdateClusterReason(t *testing.T) {
+	setupTest(t)
+	defer tearDown(t)
+	newScheme := runtime.NewScheme()
+	_ = rayiov1alpha1.AddToScheme(newScheme)
+
+	fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(testRayCluster).Build()
+
+	namespacedName := types.NamespacedName{
+		Name:      instanceName,
+		Namespace: namespaceStr,
+	}
+	cluster := rayiov1alpha1.RayCluster{}
+	err := fakeClient.Get(context.Background(), namespacedName, &cluster)
+	assert.Nil(t, err, "Fail to get RayCluster")
+	assert.Empty(t, cluster.Status.Reason, "Cluster reason should be empty")
+
+	testRayClusterReconciler := &RayClusterReconciler{
+		Client:   fakeClient,
+		Recorder: &record.FakeRecorder{},
+		Scheme:   scheme.Scheme,
+		Log:      ctrl.Log.WithName("controllers").WithName("RayCluster"),
+	}
+	reason := "test reason"
+
+	err = testRayClusterReconciler.updateClusterReason(testRayCluster, reason)
+	assert.Nil(t, err, "Fail to update cluster reason")
+
+	err = fakeClient.Get(context.Background(), namespacedName, &cluster)
+	assert.Nil(t, err, "Fail to get RayCluster after updating reason")
+	assert.Equal(t, reason, cluster.Status.Reason, "Cluster reason should be updated")
+}
+
 func TestUpdateEndpoints(t *testing.T) {
 	setupTest(t)
 	defer tearDown(t)