2 changes: 1 addition & 1 deletion api/core/v1alpha1/model_types.go
@@ -181,7 +181,7 @@ type ModelStatus struct {
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:scope=Cluster
//+kubebuilder:resource:shortName=om,scope=Cluster

// OpenModel is the Schema for the open models API
type OpenModel struct {
7 changes: 4 additions & 3 deletions api/inference/v1alpha1/backendruntime_types.go
@@ -24,8 +24,8 @@ import (
type InferenceMode string

const (
DefaultInferenceMode InferenceMode = "default"
SpeculativeDecodingInferenceMode InferenceMode = "speculative-decoding"
DefaultInferenceMode InferenceMode = "Default"
SpeculativeDecodingInferenceMode InferenceMode = "SpeculativeDecoding"
)

type BackendRuntimeArg struct {
@@ -47,6 +47,7 @@ type BackendRuntimeSpec struct {
// They can be appended or overwritten by the Playground args.
// The key is the inference option, like default one or advanced
// speculativeDecoding, the values are the corresponding args.
// A flag wrapped with {{ .XXX }} is a placeholder waiting to be rendered.
Args []BackendRuntimeArg `json:"args,omitempty"`
// Envs represents the environments set to the container.
// +optional
@@ -65,7 +66,7 @@ type BackendRuntimeStatus struct {

//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:scope=Cluster
//+kubebuilder:resource:shortName=br,scope=Cluster

// BackendRuntime is the Schema for the backendRuntime API
type BackendRuntime struct {
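To make the rendering comment on `Args` concrete, here is a minimal, self-contained sketch of how `{{ .XXX }}` placeholders could be filled in with Go's text/template before the flags reach the inference container. The data keys (`ModelPath`, `ModelName`) and the use of text/template are illustrative assumptions, not the controller's documented behavior.

```go
package main

import (
	"fmt"
	"strings"
	"text/template"
)

// renderFlags fills {{ .XXX }} placeholders in a BackendRuntime's flags before
// they are handed to the inference container. The data keys used here
// (ModelPath, ModelName) are hypothetical examples, not a documented contract.
func renderFlags(flags []string, data map[string]string) ([]string, error) {
	rendered := make([]string, 0, len(flags))
	for _, flag := range flags {
		tmpl, err := template.New("flag").Option("missingkey=error").Parse(flag)
		if err != nil {
			return nil, err
		}
		var sb strings.Builder
		if err := tmpl.Execute(&sb, data); err != nil {
			return nil, err
		}
		rendered = append(rendered, sb.String())
	}
	return rendered, nil
}

func main() {
	flags := []string{"--model", "{{ .ModelPath }}", "--served-model-name", "{{ .ModelName }}"}
	rendered, err := renderFlags(flags, map[string]string{
		"ModelPath": "/workspace/models/qwen2-0.5b",
		"ModelName": "qwen2-0.5b",
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(rendered) // [--model /workspace/models/qwen2-0.5b --served-model-name qwen2-0.5b]
}
```

Deferring rendering this way is what lets a cluster-scoped BackendRuntime declare its flags once while each Playground supplies the model-specific values.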
1 change: 0 additions & 1 deletion api/inference/v1alpha1/config_types.go
@@ -30,7 +30,6 @@ const (

type BackendRuntimeConfig struct {
// Name represents the inference backend under the hood, e.g. vLLM.
// +kubebuilder:validation:Enum={vllm,sglang,llamacpp}
// +kubebuilder:default=vllm
// +optional
Name *BackendName `json:"name,omitempty"`
1 change: 1 addition & 0 deletions api/inference/v1alpha1/playground_types.go
@@ -61,6 +61,7 @@ type PlaygroundStatus struct {
//+genclient
//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:shortName={pl}

// Playground is the Schema for the playgrounds API
type Playground struct {
3 changes: 3 additions & 0 deletions config/crd/bases/inference.llmaz.io_backendruntimes.yaml
@@ -11,6 +11,8 @@ spec:
kind: BackendRuntime
listKind: BackendRuntimeList
plural: backendruntimes
shortNames:
- br
singular: backendruntime
scope: Cluster
versions:
@@ -45,6 +47,7 @@
They can be appended or overwritten by the Playground args.
The key is the inference option, like default one or advanced
speculativeDecoding, the values are the corresponding args.
A flag wrapped with {{ .XXX }} is a placeholder waiting to be rendered.
items:
properties:
flags:
6 changes: 2 additions & 4 deletions config/crd/bases/inference.llmaz.io_playgrounds.yaml
@@ -11,6 +11,8 @@ spec:
kind: Playground
listKind: PlaygroundList
plural: playgrounds
shortNames:
- pl
singular: playground
scope: Namespaced
versions:
@@ -179,10 +181,6 @@
default: vllm
description: Name represents the inference backend under the hood,
e.g. vLLM.
enum:
- vllm
- sglang
- llamacpp
type: string
resources:
description: |-
2 changes: 2 additions & 0 deletions config/crd/bases/llmaz.io_openmodels.yaml
@@ -11,6 +11,8 @@ spec:
kind: OpenModel
listKind: OpenModelList
plural: openmodels
shortNames:
- om
singular: openmodel
scope: Cluster
versions:
6 changes: 3 additions & 3 deletions config/rbac/role.yaml
@@ -34,7 +34,7 @@ rules:
- apiGroups:
- inference.llmaz.io
resources:
- backends
- backendruntimes
verbs:
- create
- delete
@@ -46,13 +46,13 @@ rules:
- apiGroups:
- inference.llmaz.io
resources:
- backends/finalizers
- backendruntimes/finalizers
verbs:
- update
- apiGroups:
- inference.llmaz.io
resources:
- backends/status
- backendruntimes/status
verbs:
- get
- patch
Binary file modified docs/assets/arch.png
2 changes: 1 addition & 1 deletion docs/examples/llamacpp/playground.yaml
@@ -6,7 +6,7 @@ spec:
replicas: 1
modelClaim:
modelName: qwen2-0--5b-gguf
backendConfig:
backendRuntimeConfig:
name: llamacpp
args:
- -fa # use flash attention
2 changes: 1 addition & 1 deletion docs/examples/sglang/playground.yaml
@@ -6,5 +6,5 @@ spec:
replicas: 1
modelClaim:
modelName: qwen2-05b
backendConfig:
backendRuntimeConfig:
name: sglang
2 changes: 1 addition & 1 deletion docs/examples/speculative-decoding/llamacpp/playground.yaml
@@ -13,7 +13,7 @@ spec:
role: main
- name: llama2-7b-q2-k-gguf # the draft model
role: draft
backendConfig:
backendRuntimeConfig:
name: llamacpp
args:
- -fa # use flash attention
8 changes: 1 addition & 7 deletions docs/examples/speculative-decoding/vllm/playground.yaml
@@ -10,13 +10,7 @@ spec:
role: main
- name: opt-125m # the draft model
role: draft
backendConfig:
args:
- --use-v2-block-manager
- --num_speculative_tokens
- "5"
- -tp
- "1"
backendRuntimeConfig:
resources:
limits:
cpu: 8
119 changes: 54 additions & 65 deletions pkg/controller/inference/playground_controller.go
@@ -44,7 +44,7 @@ import (
inferenceapi "github.com/inftyai/llmaz/api/inference/v1alpha1"
coreclientgo "github.com/inftyai/llmaz/client-go/applyconfiguration/core/v1alpha1"
inferenceclientgo "github.com/inftyai/llmaz/client-go/applyconfiguration/inference/v1alpha1"
"github.com/inftyai/llmaz/pkg/controller_helper/backend"
helper "github.com/inftyai/llmaz/pkg/controller_helper"
modelSource "github.com/inftyai/llmaz/pkg/controller_helper/model_source"
"github.com/inftyai/llmaz/pkg/util"
)
@@ -94,32 +94,27 @@ func (r *PlaygroundReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}
}

var serviceApplyConfiguration *inferenceclientgo.ServiceApplyConfiguration

models := []*coreapi.OpenModel{}
if playground.Spec.ModelClaim != nil {
model := &coreapi.OpenModel{}
if err := r.Get(ctx, types.NamespacedName{Name: string(playground.Spec.ModelClaim.ModelName)}, model); err != nil {
if apierrors.IsNotFound(err) && handleUnexpectedCondition(playground, false, false) {
return ctrl.Result{}, r.Client.Status().Update(ctx, playground)
}
return ctrl.Result{}, err
}
models = append(models, model)
} else if playground.Spec.ModelClaims != nil {
for _, mr := range playground.Spec.ModelClaims.Models {
model := &coreapi.OpenModel{}
if err := r.Get(ctx, types.NamespacedName{Name: string(mr.Name)}, model); err != nil {
if apierrors.IsNotFound(err) && handleUnexpectedCondition(playground, false, false) {
return ctrl.Result{}, r.Client.Status().Update(ctx, playground)
}
return ctrl.Result{}, err
}
models = append(models, model)
models, err := helper.FetchModelsByPlayground(ctx, r.Client, playground)
if err != nil {
if apierrors.IsNotFound(err) && handleUnexpectedCondition(playground, false, false) {
return ctrl.Result{}, r.Client.Status().Update(ctx, playground)
}
return ctrl.Result{}, err
}

serviceApplyConfiguration = buildServiceApplyConfiguration(models, playground)
backendRuntimeName := inferenceapi.VLLM
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Name != nil {
backendRuntimeName = *playground.Spec.BackendRuntimeConfig.Name
}
backendRuntime := &inferenceapi.BackendRuntime{}
if err := r.Get(ctx, types.NamespacedName{Name: string(backendRuntimeName)}, backendRuntime); err != nil {
return ctrl.Result{}, err
}

serviceApplyConfiguration, err := buildServiceApplyConfiguration(models, playground, backendRuntime)
if err != nil {
return ctrl.Result{}, err
}

if err := setControllerReferenceForService(playground, serviceApplyConfiguration, r.Scheme); err != nil {
return ctrl.Result{}, err
@@ -185,19 +180,19 @@ func (r *PlaygroundReconciler) SetupWithManager(mgr ctrl.Manager) error {
Complete(r)
}

func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inferenceapi.Playground) *inferenceclientgo.ServiceApplyConfiguration {
func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime) (*inferenceclientgo.ServiceApplyConfiguration, error) {
// Build metadata
serviceApplyConfiguration := inferenceclientgo.Service(playground.Name, playground.Namespace)

// Build spec.
spec := inferenceclientgo.ServiceSpec()

claim := &coreclientgo.ModelClaimsApplyConfiguration{}
var claim *coreclientgo.ModelClaimsApplyConfiguration
if playground.Spec.ModelClaim != nil {
claim = coreclientgo.ModelClaims().
WithModels(coreclientgo.ModelRepresentative().WithName(playground.Spec.ModelClaim.ModelName).WithRole(coreapi.MainRole)).
WithInferenceFlavors(playground.Spec.ModelClaim.InferenceFlavors...)
} else if playground.Spec.ModelClaims != nil {
} else {
mrs := []*coreclientgo.ModelRepresentativeApplyConfiguration{}
for _, model := range playground.Spec.ModelClaims.Models {
role := coreapi.MainRole
@@ -214,10 +209,15 @@ func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inf
}

spec.WithModelClaims(claim)
spec.WithWorkloadTemplate(buildWorkloadTemplate(models, playground))
template, err := buildWorkloadTemplate(models, playground, backendRuntime)
if err != nil {
return nil, err
}

spec.WithWorkloadTemplate(template)
serviceApplyConfiguration.WithSpec(spec)

return serviceApplyConfiguration
return serviceApplyConfiguration, nil

// TODO: handle MultiModelsClaims in the future.
}
@@ -226,7 +226,7 @@ func buildServiceApplyConfiguration(models []*coreapi.OpenModel, playground *inf
// to cover both single-host and multi-host cases. There're some shortages for lws like can not force rolling
// update when one replica failed, we'll fix this in the kubernetes upstream.
// Model flavors will not be considered but in inferenceService controller to support accelerator fungibility.
func buildWorkloadTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground) lws.LeaderWorkerSetSpec {
func buildWorkloadTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime) (lws.LeaderWorkerSetSpec, error) {
// TODO: this should be leaderWorkerSetTemplateSpec, we should support in the lws upstream.
workload := lws.LeaderWorkerSetSpec{
// Use the default policy defined in lws.
@@ -240,52 +240,36 @@

// TODO: handle multi-host scenarios, e.g. nvidia.com/gpu: 32, means we'll split into 4 hosts.
// Do we need another configuration for playground for multi-host use case? I guess no currently.
workload.LeaderWorkerTemplate.WorkerTemplate = buildWorkerTemplate(models, playground)

return workload
}

func involveRole(playground *inferenceapi.Playground) coreapi.ModelRole {
if playground.Spec.ModelClaim != nil {
return coreapi.MainRole
} else if playground.Spec.ModelClaims != nil {
for _, mr := range playground.Spec.ModelClaims.Models {
if *mr.Role != coreapi.MainRole {
return *mr.Role
}
}
template, err := buildWorkerTemplate(models, playground, backendRuntime)
if err != nil {
return lws.LeaderWorkerSetSpec{}, err
}
workload.LeaderWorkerTemplate.WorkerTemplate = template

return coreapi.MainRole
return workload, nil
}

func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground) corev1.PodTemplateSpec {
backendName := inferenceapi.DefaultBackend
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Name != nil {
backendName = *playground.Spec.BackendRuntimeConfig.Name
}
bkd := backend.SwitchBackend(backendName)
func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime) (corev1.PodTemplateSpec, error) {
parser := helper.NewBackendRuntimeParser(backendRuntime)

version := bkd.DefaultVersion()
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Version != nil {
version = *playground.Spec.BackendRuntimeConfig.Version
args, err := parser.Args(helper.InferenceMode(playground), models)
if err != nil {
return corev1.PodTemplateSpec{}, err
}
envs := parser.Envs()

args := bkd.Args(models, involveRole(playground))

var envs []corev1.EnvVar
if playground.Spec.BackendRuntimeConfig != nil {
args = append(args, playground.Spec.BackendRuntimeConfig.Args...)
envs = playground.Spec.BackendRuntimeConfig.Envs
envs = append(envs, playground.Spec.BackendRuntimeConfig.Envs...)
}

resources := corev1.ResourceRequirements{
Limits: bkd.DefaultResources().Limits,
Requests: bkd.DefaultResources().Requests,
Requests: parser.Resources().Requests,
Limits: parser.Resources().Limits,
}
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Resources != nil {
limits := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Limits, resources.Limits)
requests := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Requests, resources.Requests)
limits := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Limits, parser.Resources().Limits)
requests := util.MergeResources(playground.Spec.BackendRuntimeConfig.Resources.Requests, parser.Resources().Requests)

resources = corev1.ResourceRequirements{
Limits: limits,
@@ -302,16 +286,21 @@ func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.P
}
}

version := parser.Version()
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.Version != nil {
version = *playground.Spec.BackendRuntimeConfig.Version
}

template := corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
// TODO: should we support image pull secret here?
// TODO: support readiness/liveness
Containers: []corev1.Container{
{
Name: modelSource.MODEL_RUNNER_CONTAINER_NAME,
Image: bkd.Image(version),
Image: parser.Image(version),
Resources: resources,
Command: bkd.DefaultCommand(),
Command: parser.Commands(),
Args: args,
Env: envs,
Ports: []corev1.ContainerPort{
@@ -326,7 +315,7 @@ func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.P
},
}

return template
return template, nil
}

func handleUnexpectedCondition(playground *inferenceapi.Playground, modelExists bool, serviceWithSameNameExists bool) (changed bool) {
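For context on the resource handling in buildWorkerTemplate, the following is a rough sketch of the merge semantics the MergeResources calls appear to rely on: Playground-level resources override the BackendRuntime defaults. The mergeResources helper below is an illustrative stand-in; the assumption that util.MergeResources lets its first argument win on conflicts is inferred from the call sites, not confirmed by the source.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// mergeResources is an illustrative stand-in for util.MergeResources: entries
// from the override list (the Playground's backendRuntimeConfig.resources)
// replace entries with the same key in the defaults (the BackendRuntime's
// resources). This precedence is an assumption, not the verified behavior.
func mergeResources(override, defaults corev1.ResourceList) corev1.ResourceList {
	merged := corev1.ResourceList{}
	for name, qty := range defaults {
		merged[name] = qty
	}
	for name, qty := range override {
		merged[name] = qty
	}
	return merged
}

func main() {
	defaults := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("4"),
		corev1.ResourceMemory: resource.MustParse("16Gi"),
	}
	override := corev1.ResourceList{
		corev1.ResourceCPU: resource.MustParse("8"),
	}
	for name, qty := range mergeResources(override, defaults) {
		fmt.Printf("%s: %s\n", name, qty.String()) // cpu: 8, memory: 16Gi
	}
}
```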