2 changes: 1 addition & 1 deletion api/core/v1alpha1/model_types.go
@@ -181,7 +181,7 @@ type ModelStatus struct {
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:scope=Cluster
//+kubebuilder:resource:shortName=om,scope=Cluster

// OpenModel is the Schema for the open models API
type OpenModel struct {
7 changes: 4 additions & 3 deletions api/inference/v1alpha1/backendruntime_types.go
@@ -24,8 +24,8 @@ import (
type InferenceMode string

const (
DefaultInferenceMode InferenceMode = "default"
SpeculativeDecodingInferenceMode InferenceMode = "speculative-decoding"
DefaultInferenceMode InferenceMode = "Default"
SpeculativeDecodingInferenceMode InferenceMode = "SpeculativeDecoding"
)

type BackendRuntimeArg struct {
@@ -47,6 +47,7 @@ type BackendRuntimeSpec struct {
// They can be appended or overwritten by the Playground args.
// The key is the inference option, like default one or advanced
// speculativeDecoding, the values are the corresponding args.
// A flag wrapped with {{ .XXX }} is a placeholder waiting to be rendered.
Args []BackendRuntimeArg `json:"args,omitempty"`
// Envs represents the environments set to the container.
// +optional
@@ -65,7 +66,7 @@ type BackendRuntimeStatus struct {

//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:scope=Cluster
//+kubebuilder:resource:shortName=br,scope=Cluster

// BackendRuntime is the Schema for the backendRuntime API
type BackendRuntime struct {
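To make the rendering comment on `Args` concrete, here is a minimal, self-contained sketch of how `{{ .XXX }}` placeholders could be filled in with Go's text/template before the flags reach the inference container. The data keys (`ModelPath`, `ModelName`) and the use of text/template are illustrative assumptions, not the controller's documented behavior.

```go
package main

import (
	"fmt"
	"strings"
	"text/template"
)

// renderFlags fills {{ .XXX }} placeholders in a BackendRuntime's flags before
// they are handed to the inference container. The data keys used here
// (ModelPath, ModelName) are hypothetical examples, not a documented contract.
func renderFlags(flags []string, data map[string]string) ([]string, error) {
	rendered := make([]string, 0, len(flags))
	for _, flag := range flags {
		tmpl, err := template.New("flag").Option("missingkey=error").Parse(flag)
		if err != nil {
			return nil, err
		}
		var sb strings.Builder
		if err := tmpl.Execute(&sb, data); err != nil {
			return nil, err
		}
		rendered = append(rendered, sb.String())
	}
	return rendered, nil
}

func main() {
	flags := []string{"--model", "{{ .ModelPath }}", "--served-model-name", "{{ .ModelName }}"}
	rendered, err := renderFlags(flags, map[string]string{
		"ModelPath": "/workspace/models/qwen2-0.5b",
		"ModelName": "qwen2-0.5b",
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(rendered) // [--model /workspace/models/qwen2-0.5b --served-model-name qwen2-0.5b]
}
```

Deferring rendering this way is what lets a cluster-scoped BackendRuntime declare its flags once while each Playground supplies the model-specific values.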
1 change: 0 additions & 1 deletion api/inference/v1alpha1/config_types.go
@@ -30,7 +30,6 @@ const (

type BackendRuntimeConfig struct {
// Name represents the inference backend under the hood, e.g. vLLM.
// +kubebuilder:validation:Enum={vllm,sglang,llamacpp}
// +kubebuilder:default=vllm
// +optional
Name *BackendName `json:"name,omitempty"`
1 change: 1 addition & 0 deletions api/inference/v1alpha1/playground_types.go
@@ -61,6 +61,7 @@ type PlaygroundStatus struct {
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:shortName={pl}

// Playground is the Schema for the playgrounds API
type Playground struct {
3 changes: 3 additions & 0 deletions config/crd/bases/inference.llmaz.io_backendruntimes.yaml
@@ -11,6 +11,8 @@ spec:
kind: BackendRuntime
listKind: BackendRuntimeList
plural: backendruntimes
shortNames:
- br
singular: backendruntime
scope: Cluster
versions:
@@ -45,6 +47,7 @@
They can be appended or overwritten by the Playground args.
The key is the inference option, like default one or advanced
speculativeDecoding, the values are the corresponding args.
A flag wrapped with {{ .XXX }} is a placeholder waiting to be rendered.
items:
properties:
flags:
6 changes: 2 additions & 4 deletions config/crd/bases/inference.llmaz.io_playgrounds.yaml
@@ -11,6 +11,8 @@ spec:
kind: Playground
listKind: PlaygroundList
plural: playgrounds
shortNames:
- pl
singular: playground
scope: Namespaced
versions:
@@ -179,10 +181,6 @@
default: vllm
description: Name represents the inference backend under the hood,
e.g. vLLM.
enum:
- vllm
- sglang
- llamacpp
type: string
resources:
description: |-
2 changes: 2 additions & 0 deletions config/crd/bases/llmaz.io_openmodels.yaml
@@ -11,6 +11,8 @@ spec:
kind: OpenModel
listKind: OpenModelList
plural: openmodels
shortNames:
- om
singular: openmodel
scope: Cluster
versions:
6 changes: 3 additions & 3 deletions config/rbac/role.yaml
@@ -34,7 +34,7 @@ rules:
- apiGroups:
- inference.llmaz.io
resources:
- backends
- backendruntimes
verbs:
- create
- delete
@@ -46,13 +46,13 @@ rules:
- apiGroups:
- inference.llmaz.io
resources:
- backends/finalizers
- backendruntimes/finalizers
verbs:
- update
- apiGroups:
- inference.llmaz.io
resources:
- backends/status
- backendruntimes/status
verbs:
- get
- patch
Binary file modified docs/assets/arch.png
2 changes: 1 addition & 1 deletion docs/examples/llamacpp/playground.yaml
@@ -6,7 +6,7 @@ spec:
replicas: 1
modelClaim:
modelName: qwen2-0--5b-gguf
backendConfig:
backendRuntimeConfig:
name: llamacpp
args:
- -fa # use flash attention
2 changes: 1 addition & 1 deletion docs/examples/sglang/playground.yaml
@@ -6,5 +6,5 @@ spec:
replicas: 1
modelClaim:
modelName: qwen2-05b
backendConfig:
backendRuntimeConfig:
name: sglang
2 changes: 1 addition & 1 deletion docs/examples/speculative-decoding/llamacpp/playground.yaml
@@ -13,7 +13,7 @@ spec:
role: main
- name: llama2-7b-q2-k-gguf # the draft model
role: draft
backendConfig:
backendRuntimeConfig:
name: llamacpp
args:
- -fa # use flash attention
8 changes: 1 addition & 7 deletions docs/examples/speculative-decoding/vllm/playground.yaml
@@ -10,13 +10,7 @@ spec:
role: main
- name: opt-125m # the draft model
role: draft
backendConfig:
args:
- --use-v2-block-manager
- --num_speculative_tokens
- "5"
- -tp
- "1"
backendRuntimeConfig:
resources:
limits:
cpu: 8
119 changes: 54 additions & 65 deletions pkg/controller/inference/playground_controller.go
@@ -44,7 +44,7 @@ import (
inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1"
coreclientgo "github.com/inftyai/llmaz/client-go/applyconfiguration/core/v1alpha1"
inferenceclientgo "github.com/inftyai/llmaz/client-go/applyconfiguration/inference/v1alpha1"
"github.com/inftyai/llmaz/pkg/controller_helper/backend"
helper "github.com/inftyai/llmaz/pkg/controller_helper"
modelSource "github.com/inftyai/llmaz/pkg/controller_helper/model_source"
"github.com/inftyai/llmaz/pkg/util"
)
@@ -94,32 +94,27 @@ func (r *PlaygroundReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}
}

var serviceApplyConfiguration *inferenceclientgo.ServiceApplyConfiguration

models := []*coreapi.OpenModel{}
if playground.Spec.ModelClaim != nil {
model := &coreapi.OpenModel{}
if err := r.Get(ctx, types.NamespacedName{Name: string(playground.Spec.ModelClaim.ModelName)}, model); err != nil {
if apierrors.IsNotFound(err) && handleUnexpectedCondition(playground, false, false) {
return ctrl.Result{}, r.Client.Status().Update(ctx, playground)
}
return ctrl.Result{}, err
}
models = append(models, model)
} else if playground.Spec.ModelClaims != nil {
for _, mr := range playground.Spec.ModelClaims.Models {
model := &coreapi.OpenModel{}
if err := r.Get(ctx, types.NamespacedName{Name: string(mr.Name)}, model); err != nil {
if apierrors.IsNotFound(err) && handleUnexpectedCondition(playground, false, false) {
return ctrl.Result{}, r.Client.Status().Update(ctx, playground)
}
return ctrl.Result{}, err
}
models = append(models, model)
models, err := helper.FetchModelsByPlayground(ctx, r.Client, playground)
if err != nil {
if apierrors.IsNotFound(err) && handleUnexpectedCondition(playground, false, false) {
return ctrl.Result{}, r.Client.Status().Update(ctx, playground)
}
return ctrl.Result{}, err
}

serviceApplyConfiguration = buildServiceApplyConfiguration(models, playground)
backendRuntimeName := inferenceapi.VLLM
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Name != nil {
backendRuntimeName = *playground.Spec.BackendRuntimeConfig.Name
}
backendRuntime := &inferenceapi.BackendRuntime{}
if err := r.Get(ctx, types.NamespacedName{Name: string(backendRuntimeName)}, backendRuntime); err != nil {
return ctrl.Result{}, err
}

serviceApplyConfiguration, err := buildServiceApplyConfiguration(models, playground, backendRuntime)
if err != nil {
return ctrl.Result{}, err
}

if err := setControllerReferenceForService(playground, serviceApplyConfiguration, r.Scheme); err != nil {
return ctrl.Result{}, err
@@ -185,19 +180,19 @@ func (r *PlaygroundReconciler) SetupWithManager(mgr ctrl.Manager) error {
Complete(r)
}

func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inferenceapi.Playground) *inferenceclientgo.ServiceApplyConfiguration {
func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime) (*inferenceclientgo.ServiceApplyConfiguration, error) {
// Build metadata
serviceApplyConfiguration := inferenceclientgo.Service(playground.Name, playground.Namespace)

// Build spec.
spec := inferenceclientgo.ServiceSpec()

claim := &coreclientgo.ModelClaimsApplyConfiguration{}
var claim *coreclientgo.ModelClaimsApplyConfiguration
if playground.Spec.ModelClaim != nil {
claim = coreclientgo.ModelClaims().
WithModels(coreclientgo.ModelRepresentative().WithName(playground.Spec.ModelClaim.ModelName).WithRole(coreapi.MainRole)).
WithInferenceFlavors(playground.Spec.ModelClaim.InferenceFlavors...)
} else if playground.Spec.ModelClaims != nil {
} else {
mrs := []*coreclientgo.ModelRepresentativeApplyConfiguration{}
for _, model := range playground.Spec.ModelClaims.Models {
role := coreapi.MainRole
@@ -214,10 +209,15 @@ func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inf
}

spec.WithModelClaims(claim)
spec.WithWorkloadTemplate(buildWorkloadTemplate(models, playground))
template, err := buildWorkloadTemplate(models, playground, backendRuntime)
if err != nil {
return nil, err
}

spec.WithWorkloadTemplate(template)
serviceApplyConfiguration.WithSpec(spec)

return serviceApplyConfiguration
return serviceApplyConfiguration, nil

// TODO: handle MultiModelsClaims in the future.
}
@@ -226,7 +226,7 @@ func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inf
// to cover both single-host and multi-host cases. There're some shortages for lws like can not force rolling
// update when one replica failed, we'll fix this in the kubernetes upstream.
// Model flavors will not be considered but in inferenceService controller to support accelerator fungibility.
func buildWorkloadTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground) lws.LeaderWorkerSetSpec {
func buildWorkloadTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime) (lws.LeaderWorkerSetSpec, error) {
// TODO: this should be leaderWorkerSetTemplateSpec, we should support in the lws upstream.
workload := lws.LeaderWorkerSetSpec{
// Use the default policy defined in lws.
@@ -240,52 +240,36 @@

// TODO: handle multi-host scenarios, e.g. nvidia.com/gpu: 32, means we'll split into 4 hosts.
// Do we need another configuration for playground for multi-host use case? I guess no currently.
workload.LeaderWorkerTemplate.WorkerTemplate = buildWorkerTemplate(models, playground)

return workload
}

func involveRole(playground *inferenceapi.Playground) coreapi.ModelRole {
if playground.Spec.ModelClaim != nil {
return coreapi.MainRole
} else if playground.Spec.ModelClaims != nil {
for _, mr := range playground.Spec.ModelClaims.Models {
if *mr.Role != coreapi.MainRole {
return *mr.Role
}
}
template, err := buildWorkerTemplate(models, playground, backendRuntime)
if err != nil {
return lws.LeaderWorkerSetSpec{}, err
}
workload.LeaderWorkerTemplate.WorkerTemplate = template

return coreapi.MainRole
return workload, nil
}

func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground) corev1.PodTemplateSpec {
backendName := inferenceapi.DefaultBackend
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Name != nil {
backendName = *playground.Spec.BackendRuntimeConfig.Name
}
bkd := backend.SwitchBackend(backendName)
func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime) (corev1.PodTemplateSpec, error) {
parser := helper.NewBackendRuntimeParser(backendRuntime)

version := bkd.DefaultVersion()
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Version != nil {
version = *playground.Spec.BackendRuntimeConfig.Version
args, err := parser.Args(helper.InferenceMode(playground), models)
if err != nil {
return corev1.PodTemplateSpec{}, err
}
envs := parser.Envs()

args := bkd.Args(models, involveRole(playground))

var envs []corev1.EnvVar
if playground.Spec.BackendRuntimeConfig != nil {
args = append(args, playground.Spec.BackendRuntimeConfig.Args...)
envs = playground.Spec.BackendRuntimeConfig.Envs
envs = append(envs, playground.Spec.BackendRuntimeConfig.Envs...)
}

resources := corev1.ResourceRequirements{
Limits: bkd.DefaultResources().Limits,
Requests: bkd.DefaultResources().Requests,
Requests: parser.Resources().Requests,
Limits: parser.Resources().Limits,
}
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Resources != nil {
limits := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Limits, resources.Limits)
requests := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Requests, resources.Requests)
limits := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Limits, parser.Resources().Limits)
requests := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Requests, parser.Resources().Requests)

resources = corev1.ResourceRequirements{
Limits: limits,
@@ -302,16 +286,21 @@ func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.P
}
}

version := parser.Version()
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Version != nil {
version = *playground.Spec.BackendRuntimeConfig.Version
}

template := corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
// TODO: should we support image pull secret here?
// TODO: support readiness/liveness
Containers: []corev1.Container{
{
Name: modelSource.MODEL_RUNNER_CONTAINER_NAME,
Image: bkd.Image(version),
Image: parser.Image(version),
Resources: resources,
Command: bkd.DefaultCommand(),
Command: parser.Commands(),
Args: args,
Env: envs,
Ports: []corev1.ContainerPort{
@@ -326,7 +315,7 @@ func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.P
},
}

return template
return template, nil
}

func handleUnexpectedCondition(playground *inferenceapi.Playground, modelExists bool, serviceWithSameNameExists bool) (changed bool) {
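For context on the resource handling in buildWorkerTemplate, the following is a rough sketch of the merge semantics the MergeResources calls appear to rely on: Playground-level resources override the BackendRuntime defaults. The mergeResources helper below is an illustrative stand-in; the assumption that util.MergeResources lets its first argument win on conflicts is inferred from the call sites, not confirmed by the source.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// mergeResources is an illustrative stand-in for util.MergeResources: entries
// from the override list (the Playground's backendRuntimeConfig.resources)
// replace entries with the same key in the defaults (the BackendRuntime's
// resources). This precedence is an assumption, not the verified behavior.
func mergeResources(override, defaults corev1.ResourceList) corev1.ResourceList {
	merged := corev1.ResourceList{}
	for name, qty := range defaults {
		merged[name] = qty
	}
	for name, qty := range override {
		merged[name] = qty
	}
	return merged
}

func main() {
	defaults := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("4"),
		corev1.ResourceMemory: resource.MustParse("16Gi"),
	}
	override := corev1.ResourceList{
		corev1.ResourceCPU: resource.MustParse("8"),
	}
	for name, qty := range mergeResources(override, defaults) {
		fmt.Printf("%s: %s\n", name, qty.String()) // cpu: 8, memory: 16Gi
	}
}
```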