Skip to content

Commit 2309b2b

Browse files
authored
feat: update RayCluster .status.reason field with pod creation error (ray-project#639)
* feat: update RayCluster `.status.reason` field with pod creation error Makes RayCluster errors related to Pod creation more apparent to the user.
1 parent 1d3c3a4 commit 2309b2b

9 files changed

Lines changed: 76 additions & 1 deletion

File tree

helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11146,6 +11146,9 @@ spec:
1114611146
each node group.
1114711147
format: int32
1114811148
type: integer
11149+
reason:
11150+
description: Reason provides more information about current State
11151+
type: string
1114911152
state:
1115011153
description: 'INSERT ADDITIONAL STATUS FIELD - define observed state
1115111154
of cluster Important: Run "make" to regenerat'

helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11704,6 +11704,9 @@ spec:
1170411704
of each node group.
1170511705
format: int32
1170611706
type: integer
11707+
reason:
11708+
description: Reason provides more information about current State
11709+
type: string
1170711710
state:
1170811711
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
1170911712
state of cluster Important: Run "make" to regenerat'

helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11760,6 +11760,10 @@ spec:
1176011760
of each node group.
1176111761
format: int32
1176211762
type: integer
11763+
reason:
11764+
description: Reason provides more information about current
11765+
State
11766+
type: string
1176311767
state:
1176411768
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
1176511769
state of cluster Important: Run "make" to regenerat'
@@ -11867,6 +11871,10 @@ spec:
1186711871
of each node group.
1186811872
format: int32
1186911873
type: integer
11874+
reason:
11875+
description: Reason provides more information about current
11876+
State
11877+
type: string
1187011878
state:
1187111879
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
1187211880
state of cluster Important: Run "make" to regenerat'

ray-operator/apis/ray/v1alpha1/raycluster_types.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,8 @@ type RayClusterStatus struct {
119119
Endpoints map[string]string `json:"endpoints,omitempty"`
120120
// Head info
121121
Head HeadInfo `json:"head,omitempty"`
122+
// Reason provides more information about current State
123+
Reason string `json:"reason,omitempty"`
122124
}
123125

124126
// HeadInfo gives info about head
@@ -167,5 +169,6 @@ func init() {
167169
type EventReason string
168170

169171
const (
170-
RayConfigError EventReason = "RayConfigError"
172+
RayConfigError EventReason = "RayConfigError"
173+
PodReconciliationError EventReason = "PodReconciliationError"
171174
)

ray-operator/config/crd/bases/ray.io_rayclusters.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11146,6 +11146,9 @@ spec:
1114611146
each node group.
1114711147
format: int32
1114811148
type: integer
11149+
reason:
11150+
description: Reason provides more information about current State
11151+
type: string
1114911152
state:
1115011153
description: 'INSERT ADDITIONAL STATUS FIELD - define observed state
1115111154
of cluster Important: Run "make" to regenerat'

ray-operator/config/crd/bases/ray.io_rayjobs.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11704,6 +11704,9 @@ spec:
1170411704
of each node group.
1170511705
format: int32
1170611706
type: integer
11707+
reason:
11708+
description: Reason provides more information about current State
11709+
type: string
1170711710
state:
1170811711
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
1170911712
state of cluster Important: Run "make" to regenerat'

ray-operator/config/crd/bases/ray.io_rayservices.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11760,6 +11760,10 @@ spec:
1176011760
of each node group.
1176111761
format: int32
1176211762
type: integer
11763+
reason:
11764+
description: Reason provides more information about current
11765+
State
11766+
type: string
1176311767
state:
1176411768
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
1176511769
state of cluster Important: Run "make" to regenerat'
@@ -11867,6 +11871,10 @@ spec:
1186711871
of each node group.
1186811872
format: int32
1186911873
type: integer
11874+
reason:
11875+
description: Reason provides more information about current
11876+
State
11877+
type: string
1187011878
state:
1187111879
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
1187211880
state of cluster Important: Run "make" to regenerat'

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3232
"sigs.k8s.io/controller-runtime/pkg/handler"
3333
"sigs.k8s.io/controller-runtime/pkg/manager"
34+
"sigs.k8s.io/controller-runtime/pkg/predicate"
3435
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3536
"sigs.k8s.io/controller-runtime/pkg/source"
3637
)
@@ -214,6 +215,10 @@ func (r *RayClusterReconciler) rayClusterReconcile(request ctrl.Request, instanc
214215
if updateErr := r.updateClusterState(instance, rayiov1alpha1.Failed); updateErr != nil {
215216
r.Log.Error(updateErr, "RayCluster update state error", "cluster name", request.Name)
216217
}
218+
if updateErr := r.updateClusterReason(instance, err.Error()); updateErr != nil {
219+
r.Log.Error(updateErr, "RayCluster update reason error", "cluster name", request.Name)
220+
}
221+
r.Recorder.Event(instance, corev1.EventTypeWarning, string(rayiov1alpha1.PodReconciliationError), err.Error())
217222
return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err
218223
}
219224
// update the status if needed
@@ -762,6 +767,7 @@ func (r *RayClusterReconciler) buildWorkerPod(instance rayiov1alpha1.RayCluster,
762767
func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager, reconcileConcurrency int) error {
763768
return ctrl.NewControllerManagedBy(mgr).
764769
For(&rayiov1alpha1.RayCluster{}).Named("raycluster-controller").
770+
WithEventFilter(predicate.Or(predicate.GenerationChangedPredicate{}, predicate.LabelChangedPredicate{}, predicate.AnnotationChangedPredicate{})).
765771
Watches(&source.Kind{Type: &corev1.Event{}}, &handler.EnqueueRequestForObject{}).
766772
Watches(&source.Kind{Type: &corev1.Pod{}}, &handler.EnqueueRequestForOwner{
767773
IsController: true,
@@ -1049,3 +1055,8 @@ func (r *RayClusterReconciler) updateClusterState(instance *rayiov1alpha1.RayClu
10491055
instance.Status.State = clusterState
10501056
return r.Status().Update(context.Background(), instance)
10511057
}
1058+
1059+
// updateClusterReason stores clusterReason in instance.Status.Reason and then
// writes the instance's status through the reconciler's status client.
// It returns whatever error the status update returns.
func (r *RayClusterReconciler) updateClusterReason(instance *rayiov1alpha1.RayCluster, clusterReason string) error {
1060+
instance.Status.Reason = clusterReason
1061+
// The update is issued with a fresh background context, not a caller-supplied one.
return r.Status().Update(context.Background(), instance)
1062+
}

ray-operator/controllers/ray/raycluster_controller_fake_test.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -774,6 +774,39 @@ func TestReconcile_AutoscalerRoleBinding(t *testing.T) {
774774
assert.Nil(t, err, "Fail to get autoscaler RoleBinding after reconciliation")
775775
}
776776

777+
// TestReconcile_UpdateClusterReason checks that updateClusterReason makes the
// given reason visible in the RayCluster's status when the object is read back
// through the fake client.
func TestReconcile_UpdateClusterReason(t *testing.T) {
778+
setupTest(t)
779+
defer tearDown(t)
780+
newScheme := runtime.NewScheme()
781+
_ = rayiov1alpha1.AddToScheme(newScheme)
782+
783+
// Seed the fake client with the testRayCluster fixture.
fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(testRayCluster).Build()
784+
785+
namespacedName := types.NamespacedName{
786+
Name: instanceName,
787+
Namespace: namespaceStr,
788+
}
789+
cluster := rayiov1alpha1.RayCluster{}
790+
// Precondition: the stored cluster has no reason before the update.
err := fakeClient.Get(context.Background(), namespacedName, &cluster)
791+
assert.Nil(t, err, "Fail to get RayCluster")
792+
assert.Empty(t, cluster.Status.Reason, "Cluster reason should be empty")
793+
794+
testRayClusterReconciler := &RayClusterReconciler{
795+
Client: fakeClient,
796+
Recorder: &record.FakeRecorder{},
797+
Scheme: scheme.Scheme,
798+
Log: ctrl.Log.WithName("controllers").WithName("RayCluster"),
799+
}
800+
reason := "test reason"
801+
802+
// NOTE(review): the update is applied to the testRayCluster fixture, not to the
// `cluster` value fetched above, so the fixture's Status.Reason is mutated too.
err = testRayClusterReconciler.updateClusterReason(testRayCluster, reason)
803+
assert.Nil(t, err, "Fail to update cluster reason")
804+
805+
// Re-read the object from the fake client to confirm the reason was persisted.
err = fakeClient.Get(context.Background(), namespacedName, &cluster)
806+
assert.Nil(t, err, "Fail to get RayCluster after updating reason")
807+
// NOTE(review): testify's assert.Equal presumably expects (expected, actual); the
// arguments here look swapped, which would only affect the failure message — confirm.
assert.Equal(t, cluster.Status.Reason, reason, "Cluster reason should be updated")
808+
}
809+
777810
func TestUpdateEndpoints(t *testing.T) {
778811
setupTest(t)
779812
defer tearDown(t)

0 commit comments

Comments
 (0)