Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions config/rbac/controller-manager/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -308,3 +308,10 @@ rules:
- list
- update
- watch
- apiGroups:
- apiextensions.k8s.io
resources:
- customresourcedefinitions
verbs:
- get
- list
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,22 @@ import (
"sync"
"time"

"github.com/vllm-project/aibrix/pkg/controller/util/expectation"
apierrors "k8s.io/apimachinery/pkg/api/errors"

rayclusterv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
orchestrationv1alpha1 "github.com/vllm-project/aibrix/api/orchestration/v1alpha1"
"github.com/vllm-project/aibrix/pkg/config"
"k8s.io/client-go/tools/record"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"github.com/vllm-project/aibrix/pkg/controller/util/expectation"

orchestrationv1alpha1 "github.com/vllm-project/aibrix/api/orchestration/v1alpha1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

var (
Expand All @@ -45,11 +46,16 @@ var (
controllerKind = orchestrationv1alpha1.GroupVersion.WithKind("RayClusterReplicaSet")
)

// Add creates a new RayClusterReplicaSet Controller and adds it to the Manager with default RBAC.
// Add first validates that the required Ray CRD (e.g., "rayclusters.ray.io") exists in the cluster.
// If the CRD is not found, the function fails early with an error.
// If the CRD exists, this function creates a new RayClusterReplicaSet Controller and adds it to the Manager with default RBAC.
// The Manager will set fields on the Controller and Start it when the Manager is Started.
func Add(mgr manager.Manager, runtimeConfig config.RuntimeConfig) error {
// TODO: check crd exists or not. If not, we should fail here directly without moving forward.
// This is used to validate whether kuberay is installed now.
// Check if the CRD exists. If not, fail directly.
crdName := "rayclusters.ray.io"
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CRD name is hard-coded here; I'm not sure whether that is good enough.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is OK for the short term, but it would be better to construct the name from the scheme.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤔 In addition to the ray dependency, do we also need to do the same for the EnvoyGateway object?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you find anywhere suitable for the EnvoyGateway object check? Without it, the load balancer won’t be populated. Adding checks could help catch the issue earlier, but the failure would still be visible even without them.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> Do you find anywhere suitable for the EnvoyGateway object check? Without it, the load balancer won’t be populated. Adding checks could help catch the issue earlier, but the failure would still be visible even without them.

Sounds reasonable, we can find this problem again when creating envoy-gateway-config. 🤔

if err := checkCRDExists(mgr.GetClient(), crdName); err != nil {
return fmt.Errorf("failed to validate CRD: %v", err)
}

r, err := newReconciler(mgr, runtimeConfig)
if err != nil {
Expand All @@ -58,6 +64,29 @@ func Add(mgr manager.Manager, runtimeConfig config.RuntimeConfig) error {
return add(mgr, r)
}

// checkCRDExists verifies that the CustomResourceDefinition named crdName can
// be fetched through c.
//
// The lookup is done with an unstructured object carrying the
// apiextensions.k8s.io/v1 CustomResourceDefinition GroupVersionKind, keyed by
// name only (no namespace is set on the object key).
//
// It returns nil when the Get succeeds. When the API reports NotFound it
// returns an error telling the operator to install crdName. Any other failure
// is returned wrapped with %w, so callers can still inspect the underlying
// error with errors.Is / errors.As.
func checkCRDExists(c client.Client, crdName string) error {
	gvk := schema.GroupVersionKind{
		Group:   "apiextensions.k8s.io",
		Version: "v1",
		Kind:    "CustomResourceDefinition",
	}

	// Create an unstructured object to represent the CRD; Get fills it in.
	crd := &unstructured.Unstructured{}
	crd.SetGroupVersionKind(gvk)
	crd.SetName(crdName)

	err := c.Get(context.TODO(), client.ObjectKey{Name: crdName}, crd)
	if err != nil {
		if apierrors.IsNotFound(err) {
			// Missing CRD is the expected failure mode: give an actionable message.
			return fmt.Errorf("CRD %q not found. Please ensure %q is installed", crdName, crdName)
		}
		// Wrap (rather than flatten) so the original error stays in the chain.
		return fmt.Errorf("error checking CRD %q: %w", crdName, err)
	}
	return nil
}

// newReconciler returns a new reconcile.Reconciler
func newReconciler(mgr manager.Manager, runtimeConfig config.RuntimeConfig) (reconcile.Reconciler, error) {
reconciler := &RayClusterReplicaSetReconciler{
Expand Down
Loading