Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11146,6 +11146,9 @@ spec:
each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed state
of cluster Important: Run "make" to regenerat'
Expand Down
3 changes: 3 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11704,6 +11704,9 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down
8 changes: 8 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11760,6 +11760,10 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current
State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down Expand Up @@ -11867,6 +11871,10 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current
State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down
5 changes: 4 additions & 1 deletion ray-operator/apis/ray/v1alpha1/raycluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ type RayClusterStatus struct {
Endpoints map[string]string `json:"endpoints,omitempty"`
// Head info
Head HeadInfo `json:"head,omitempty"`
// Reason provides more information about current State
Reason string `json:"reason,omitempty"`
}

// HeadInfo gives info about head
Expand Down Expand Up @@ -167,5 +169,6 @@ func init() {
type EventReason string

const (
RayConfigError EventReason = "RayConfigError"
RayConfigError EventReason = "RayConfigError"
PodReconciliationError EventReason = "PodReconciliationError"
)
3 changes: 3 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11146,6 +11146,9 @@ spec:
each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed state
of cluster Important: Run "make" to regenerat'
Expand Down
3 changes: 3 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11704,6 +11704,9 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down
8 changes: 8 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11760,6 +11760,10 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current
State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down Expand Up @@ -11867,6 +11871,10 @@ spec:
of each node group.
format: int32
type: integer
reason:
description: Reason provides more information about current
State
type: string
state:
description: 'INSERT ADDITIONAL STATUS FIELD - define observed
state of cluster Important: Run "make" to regenerat'
Expand Down
11 changes: 11 additions & 0 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"
)
Expand Down Expand Up @@ -214,6 +215,10 @@ func (r *RayClusterReconciler) rayClusterReconcile(request ctrl.Request, instanc
if updateErr := r.updateClusterState(instance, rayiov1alpha1.Failed); updateErr != nil {
r.Log.Error(updateErr, "RayCluster update state error", "cluster name", request.Name)
}
if updateErr := r.updateClusterReason(instance, err.Error()); updateErr != nil {
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@DmitriGekhtman The reconcile loop doesn't cease because the err.Error() string here is always different. It begins with pods "POD_NAME" is forbidden: exceeded quota: quota... and POD_NAME keeps changing. So updating the .status.reason queues the RayCluster for another reconciliation which updates .status.reason which queues it again, on and on.

I'll make the string here stable which should fix.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, that's interesting! Outputting the same error sgtm.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@DmitriGekhtman this is ugly and brittle. Is there a way to have the reconcile loop ignore changes to .status.reason or perhaps .status altogether? I want to do something like this. I think relevant code is here and/or here? But not sure and need help.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, maybe we could just set the reason to something like "Pod reconcile failed" and then display the full error by emitting an event, which would be easily accessible via kubectl describe.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sgtm, I’ll take a look soon

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use Or to also watch for changes to Labels and Annotations.

CR annotations are currently taken into account by the operator.
Labels are not taken into account for now, but they might be later.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The con of ignoring status changes is that extraneous status changes are not quickly fixed. I think that's a minor issue, though.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, updated. Also included emitting event since it's small and straightforward. Lmk if there's any good examples of how to test event or label/annotation update-trigger-reconciliation logic.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested on my cluster. Updating labels or annotations triggers reconciliation as expected. Events also show up as expected. See gist

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, thanks!

r.Log.Error(updateErr, "RayCluster update reason error", "cluster name", request.Name)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good start. we may want to expose more for other state.

}
r.Recorder.Event(instance, corev1.EventTypeWarning, string(rayiov1alpha1.PodReconciliationError), err.Error())
return ctrl.Result{RequeueAfter: DefaultRequeueDuration}, err
}
// update the status if needed
Expand Down Expand Up @@ -762,6 +767,7 @@ func (r *RayClusterReconciler) buildWorkerPod(instance rayiov1alpha1.RayCluster,
func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager, reconcileConcurrency int) error {
return ctrl.NewControllerManagedBy(mgr).
For(&rayiov1alpha1.RayCluster{}).Named("raycluster-controller").
WithEventFilter(predicate.Or(predicate.GenerationChangedPredicate{}, predicate.LabelChangedPredicate{}, predicate.AnnotationChangedPredicate{})).
Watches(&source.Kind{Type: &corev1.Event{}}, &handler.EnqueueRequestForObject{}).
Watches(&source.Kind{Type: &corev1.Pod{}}, &handler.EnqueueRequestForOwner{
IsController: true,
Expand Down Expand Up @@ -1049,3 +1055,8 @@ func (r *RayClusterReconciler) updateClusterState(instance *rayiov1alpha1.RayClu
instance.Status.State = clusterState
return r.Status().Update(context.Background(), instance)
}

func (r *RayClusterReconciler) updateClusterReason(instance *rayiov1alpha1.RayCluster, clusterReason string) error {
instance.Status.Reason = clusterReason
return r.Status().Update(context.Background(), instance)
}
33 changes: 33 additions & 0 deletions ray-operator/controllers/ray/raycluster_controller_fake_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,39 @@ func TestReconcile_AutoscalerRoleBinding(t *testing.T) {
assert.Nil(t, err, "Fail to get autoscaler RoleBinding after reconciliation")
}

func TestReconcile_UpdateClusterReason(t *testing.T) {
setupTest(t)
defer tearDown(t)
newScheme := runtime.NewScheme()
_ = rayiov1alpha1.AddToScheme(newScheme)

fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(testRayCluster).Build()

namespacedName := types.NamespacedName{
Name: instanceName,
Namespace: namespaceStr,
}
cluster := rayiov1alpha1.RayCluster{}
err := fakeClient.Get(context.Background(), namespacedName, &cluster)
assert.Nil(t, err, "Fail to get RayCluster")
assert.Empty(t, cluster.Status.Reason, "Cluster reason should be empty")

testRayClusterReconciler := &RayClusterReconciler{
Client: fakeClient,
Recorder: &record.FakeRecorder{},
Scheme: scheme.Scheme,
Log: ctrl.Log.WithName("controllers").WithName("RayCluster"),
}
reason := "test reason"

err = testRayClusterReconciler.updateClusterReason(testRayCluster, reason)
assert.Nil(t, err, "Fail to update cluster reason")

err = fakeClient.Get(context.Background(), namespacedName, &cluster)
assert.Nil(t, err, "Fail to get RayCluster after updating reason")
assert.Equal(t, cluster.Status.Reason, reason, "Cluster reason should be updated")
}

func TestUpdateEndpoints(t *testing.T) {
setupTest(t)
defer tearDown(t)
Expand Down