Skip to content

Commit 80c3ee5

Browse files
authored
fix: only filter RayCluster events for reconciliation (ray-project#882)
ray-project#639 accidentally applied its event filters to the child resources (Pods and Services) as well as to RayCluster itself. This change applies the filters only to RayCluster events; Pod and Service events are no longer filtered. This means Pod updates will trigger RayCluster reconciliation.
1 parent de0aaf5 commit 80c3ee5

5 files changed

Lines changed: 78 additions & 21 deletions

File tree

ray-operator/controllers/ray/common/constant.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ const (
8383
RAY_SERVE_KV_TIMEOUT_S = "RAY_SERVE_KV_TIMEOUT_S"
8484
SERVE_CONTROLLER_PIN_ON_NODE = "RAY_INTERNAL_SERVE_CONTROLLER_PIN_ON_NODE"
8585
RAY_USAGE_STATS_KUBERAY_IN_USE = "RAY_USAGE_STATS_KUBERAY_IN_USE"
86+
RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV = "RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV"
87+
RAYCLUSTER_DEFAULT_REQUEUE_SECONDS = 300
8688

8789
// Ray core default configurations
8890
DefaultRedisPassword = "5241590000000000"

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"k8s.io/apimachinery/pkg/runtime"
2929
"k8s.io/apimachinery/pkg/types"
3030
ctrl "sigs.k8s.io/controller-runtime"
31+
"sigs.k8s.io/controller-runtime/pkg/builder"
3132
"sigs.k8s.io/controller-runtime/pkg/client"
3233
controller "sigs.k8s.io/controller-runtime/pkg/controller"
3334
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
@@ -236,13 +237,13 @@ func (r *RayClusterReconciler) rayClusterReconcile(request ctrl.Request, instanc
236237
}
237238

238239
// Unconditionally requeue after the number of seconds specified in the
239-
// environment variable RAYCLUSTER_DEFAULT_RECONCILE_LOOP_S. If the
240-
// environment variable is not set, requeue after 5 minutes.
240+
// environment variable RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV. If the
241+
// environment variable is not set, requeue after the default value.
241242
var requeueAfterSeconds int
242-
requeueAfterSeconds, err := strconv.Atoi(os.Getenv("RAYCLUSTER_DEFAULT_RECONCILE_LOOP_S"))
243+
requeueAfterSeconds, err := strconv.Atoi(os.Getenv(common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV))
243244
if err != nil {
244-
r.Log.Info("RAYCLUSTER_DEFAULT_RECONCILE_LOOP_S is not set, using default value 300s", "cluster name", request.Name)
245-
requeueAfterSeconds = 5 * 60
245+
r.Log.Info(fmt.Sprintf("Environment variable %s is not set, using default value of %d seconds", common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV, common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS), "cluster name", request.Name)
246+
requeueAfterSeconds = common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS
246247
}
247248
r.Log.Info("Unconditional requeue after", "cluster name", request.Name, "seconds", requeueAfterSeconds)
248249
return ctrl.Result{RequeueAfter: time.Duration(requeueAfterSeconds) * time.Second}, nil
@@ -810,8 +811,12 @@ func (r *RayClusterReconciler) buildWorkerPod(instance rayiov1alpha1.RayCluster,
810811
// SetupWithManager builds the reconciler.
811812
func (r *RayClusterReconciler) SetupWithManager(mgr ctrl.Manager, reconcileConcurrency int) error {
812813
b := ctrl.NewControllerManagedBy(mgr).
813-
For(&rayiov1alpha1.RayCluster{}).Named("raycluster-controller").
814-
WithEventFilter(predicate.Or(predicate.GenerationChangedPredicate{}, predicate.LabelChangedPredicate{}, predicate.AnnotationChangedPredicate{})).
814+
Named("raycluster-controller").
815+
For(&rayiov1alpha1.RayCluster{}, builder.WithPredicates(predicate.Or(
816+
predicate.GenerationChangedPredicate{},
817+
predicate.LabelChangedPredicate{},
818+
predicate.AnnotationChangedPredicate{},
819+
))).
815820
Watches(&source.Kind{Type: &corev1.Event{}}, &handler.EnqueueRequestForObject{}).
816821
Owns(&corev1.Pod{}).
817822
Owns(&corev1.Service{})

ray-operator/controllers/ray/raycluster_controller_test.go

Lines changed: 55 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package ray
1818
import (
1919
"context"
2020
"fmt"
21+
"log"
2122
"reflect"
2223
"time"
2324

@@ -46,6 +47,7 @@ const (
4647
var _ = Context("Inside the default namespace", func() {
4748
ctx := context.TODO()
4849
var workerPods corev1.PodList
50+
var headPods corev1.PodList
4951
enableInTreeAutoscaling := true
5052

5153
myRayCluster := &rayiov1alpha1.RayCluster{
@@ -62,7 +64,6 @@ var _ = Context("Inside the default namespace", func() {
6264
"port": "6379",
6365
"object-manager-port": "12345",
6466
"node-manager-port": "12346",
65-
"object-store-memory": "100000000",
6667
"num-cpus": "1",
6768
},
6869
Template: corev1.PodTemplateSpec{
@@ -126,7 +127,8 @@ var _ = Context("Inside the default namespace", func() {
126127
},
127128
}
128129

129-
filterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayCluster.Name, common.RayNodeGroupLabelKey: "small-group"}
130+
headFilterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayCluster.Name, common.RayNodeGroupLabelKey: "headgroup"}
131+
workerFilterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayCluster.Name, common.RayNodeGroupLabelKey: "small-group"}
130132

131133
Describe("When creating a raycluster", func() {
132134
It("should create a raycluster object", func() {
@@ -150,17 +152,15 @@ var _ = Context("Inside the default namespace", func() {
150152

151153
It("should create 3 workers", func() {
152154
Eventually(
153-
listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}),
155+
listResourceFunc(ctx, &workerPods, workerFilterLabels, &client.ListOptions{Namespace: "default"}),
154156
time.Second*15, time.Millisecond*500).Should(Equal(3), fmt.Sprintf("workerGroup %v", workerPods.Items))
155157
if len(workerPods.Items) > 0 {
156158
Expect(workerPods.Items[0].Status.Phase).Should(Or(Equal(corev1.PodRunning), Equal(corev1.PodPending)))
157159
}
158160
})
159161

160162
It("should create a head pod resource", func() {
161-
var headPods corev1.PodList
162-
filterLabels := client.MatchingLabels{common.RayClusterLabelKey: myRayCluster.Name, common.RayNodeGroupLabelKey: "headgroup"}
163-
err := k8sClient.List(ctx, &headPods, filterLabels, &client.ListOptions{Namespace: "default"}, client.InNamespace(myRayCluster.Namespace))
163+
err := k8sClient.List(ctx, &headPods, headFilterLabels, &client.ListOptions{Namespace: "default"}, client.InNamespace(myRayCluster.Namespace))
164164
Expect(err).NotTo(HaveOccurred(), "failed list head pods")
165165
Expect(len(headPods.Items)).Should(BeNumerically("==", 1), "My head pod list= %v", headPods.Items)
166166

@@ -190,9 +190,42 @@ var _ = Context("Inside the default namespace", func() {
190190
time.Second*15, time.Millisecond*500).Should(BeNil(), "autoscaler RoleBinding = %v", rbName)
191191
})
192192

193+
It("should be able to update all Pods to Running", func() {
194+
// We need to manually update Pod statuses otherwise they'll always be Pending.
195+
// envtest doesn't create a full K8s cluster. It's only the control plane.
196+
// There's no container runtime or any other K8s controllers.
197+
// So Pods are created, but no controller updates them from Pending to Running.
198+
// See https://book.kubebuilder.io/reference/envtest.html
199+
for _, headPod := range headPods.Items {
200+
headPod.Status.Phase = corev1.PodRunning
201+
Expect(k8sClient.Status().Update(ctx, &headPod)).Should(BeNil())
202+
}
203+
err := k8sClient.List(ctx, &headPods, headFilterLabels, &client.ListOptions{Namespace: "default"})
204+
Expect(err).ShouldNot(HaveOccurred(), "failed to list head Pods")
205+
for _, headPod := range headPods.Items {
206+
Expect(headPod.Status.Phase).Should(Equal(corev1.PodRunning))
207+
}
208+
209+
for _, workerPod := range workerPods.Items {
210+
workerPod.Status.Phase = corev1.PodRunning
211+
Expect(k8sClient.Status().Update(ctx, &workerPod)).Should(BeNil())
212+
}
213+
err = k8sClient.List(ctx, &workerPods, workerFilterLabels, &client.ListOptions{Namespace: "default"})
214+
Expect(err).ShouldNot(HaveOccurred(), "failed to list worker Pods")
215+
for _, workerPod := range workerPods.Items {
216+
Expect(workerPod.Status.Phase).Should(Equal(corev1.PodRunning))
217+
}
218+
})
219+
220+
It("cluster's .status.state should be updated to 'ready' shortly after all Pods are Running", func() {
221+
Eventually(
222+
getClusterState(ctx, "default", myRayCluster.Name),
223+
time.Second*(common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayiov1alpha1.Ready))
224+
})
225+
193226
It("should re-create a deleted worker", func() {
194227
Eventually(
195-
listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}),
228+
listResourceFunc(ctx, &workerPods, workerFilterLabels, &client.ListOptions{Namespace: "default"}),
196229
time.Second*15, time.Millisecond*500).Should(Equal(3), fmt.Sprintf("workerGroup %v", workerPods.Items))
197230

198231
pod := workerPods.Items[0]
@@ -203,7 +236,7 @@ var _ = Context("Inside the default namespace", func() {
203236

204237
// at least 3 pods should be in a non-failed phase
205238
Eventually(
206-
listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}),
239+
listResourceFunc(ctx, &workerPods, workerFilterLabels, &client.ListOptions{Namespace: "default"}),
207240
time.Second*15, time.Millisecond*500).Should(Equal(3), fmt.Sprintf("workerGroup %v", workerPods.Items))
208241
})
209242

@@ -228,7 +261,7 @@ var _ = Context("Inside the default namespace", func() {
228261
It("should have only 2 running worker", func() {
229262
// retry listing pods, given that last update may not immediately happen.
230263
Eventually(
231-
listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}),
264+
listResourceFunc(ctx, &workerPods, workerFilterLabels, &client.ListOptions{Namespace: "default"}),
232265
time.Second*15, time.Millisecond*500).Should(Equal(2), fmt.Sprintf("workerGroup %v", workerPods.Items))
233266
})
234267

@@ -250,7 +283,7 @@ var _ = Context("Inside the default namespace", func() {
250283
It("should have only 1 running worker", func() {
251284
// retry listing pods, given that last update may not immediately happen.
252285
Eventually(
253-
listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}),
286+
listResourceFunc(ctx, &workerPods, workerFilterLabels, &client.ListOptions{Namespace: "default"}),
254287
time.Second*15, time.Millisecond*500).Should(Equal(1), fmt.Sprintf("workerGroup %v", workerPods.Items))
255288
})
256289

@@ -275,14 +308,14 @@ var _ = Context("Inside the default namespace", func() {
275308
It("should scale to maxReplicas (4) workers", func() {
276309
// retry listing pods, given that last update may not immediately happen.
277310
Eventually(
278-
listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}),
311+
listResourceFunc(ctx, &workerPods, workerFilterLabels, &client.ListOptions{Namespace: "default"}),
279312
time.Second*15, time.Millisecond*500).Should(Equal(4), fmt.Sprintf("workerGroup %v", workerPods.Items))
280313
})
281314

282315
It("should countinue to have only maxReplicas (4) workers", func() {
283316
// check that pod count stays at 4 for two seconds.
284317
Consistently(
285-
listResourceFunc(ctx, &workerPods, filterLabels, &client.ListOptions{Namespace: "default"}),
318+
listResourceFunc(ctx, &workerPods, workerFilterLabels, &client.ListOptions{Namespace: "default"}),
286319
time.Second*2, time.Millisecond*200).Should(Equal(4), fmt.Sprintf("workerGroup %v", workerPods.Items))
287320
})
288321
})
@@ -330,3 +363,13 @@ func retryOnOldRevision(attempts int, sleep time.Duration, f func() error) error
330363
}
331364
return fmt.Errorf("after %d attempts, last error: %s", attempts, err)
332365
}
366+
367+
func getClusterState(ctx context.Context, namespace string, clusterName string) func() rayiov1alpha1.ClusterState {
368+
return func() rayiov1alpha1.ClusterState {
369+
var cluster rayiov1alpha1.RayCluster
370+
if err := k8sClient.Get(ctx, client.ObjectKey{Namespace: namespace, Name: clusterName}, &cluster); err != nil {
371+
log.Fatal(err)
372+
}
373+
return cluster.Status.State
374+
}
375+
}

ray-operator/controllers/ray/rayservice_controller.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"k8s.io/apimachinery/pkg/api/errors"
2222
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2323
"k8s.io/client-go/tools/record"
24+
"sigs.k8s.io/controller-runtime/pkg/builder"
2425
"sigs.k8s.io/controller-runtime/pkg/manager"
2526
"sigs.k8s.io/controller-runtime/pkg/predicate"
2627

@@ -219,8 +220,11 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
219220
// SetupWithManager sets up the controller with the Manager.
220221
func (r *RayServiceReconciler) SetupWithManager(mgr ctrl.Manager) error {
221222
return ctrl.NewControllerManagedBy(mgr).
222-
For(&rayv1alpha1.RayService{}).
223-
WithEventFilter(predicate.Or(predicate.GenerationChangedPredicate{}, predicate.LabelChangedPredicate{}, predicate.AnnotationChangedPredicate{})).
223+
For(&rayv1alpha1.RayService{}, builder.WithPredicates(predicate.Or(
224+
predicate.GenerationChangedPredicate{},
225+
predicate.LabelChangedPredicate{},
226+
predicate.AnnotationChangedPredicate{},
227+
))).
224228
Owns(&rayv1alpha1.RayCluster{}).
225229
Owns(&corev1.Service{}).
226230
Owns(&networkingv1.Ingress{}).

ray-operator/controllers/ray/suite_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@ limitations under the License.
1616
package ray
1717

1818
import (
19+
"os"
1920
"path/filepath"
2021
"testing"
2122

23+
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
2224
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
2325

2426
"sigs.k8s.io/controller-runtime/pkg/envtest/printer"
@@ -79,6 +81,7 @@ var _ = BeforeSuite(func(done Done) {
7981
Expect(k8sClient).ToNot(BeNil())
8082

8183
// Suggested way to run tests
84+
os.Setenv(common.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV, "10")
8285
mgr, err := ctrl.NewManager(cfg, ctrl.Options{
8386
Scheme: scheme.Scheme,
8487
MetricsBindAddress: "0",

0 commit comments

Comments
 (0)