Merged
1 change: 0 additions & 1 deletion .golangci.yaml
@@ -32,7 +32,6 @@ linters:
- errcheck
- exportloopref
- goconst
- gocyclo
- gofmt
- goimports
- gosimple
6 changes: 4 additions & 2 deletions api/core/v1alpha1/model_types.go
@@ -139,12 +139,14 @@ type ModelClaim struct {
type ModelRole string

const (
// Main represents the main model, if only one model is required,
// MainRole represents the main model; if only one model is required,
// it must be the main model. Only one main model is allowed.
MainRole ModelRole = "main"
// Draft represents the draft model in speculative decoding,
// DraftRole represents the draft model in speculative decoding,
// in which case the main model is the target model.
DraftRole ModelRole = "draft"
// LoraRole represents the LoRA model.
LoraRole ModelRole = "lora"
)

// ModelRefer refers to a created Model with its role.
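To make the roles concrete, a Playground claiming both a main and a draft model might look like the sketch below (model names and replica count are hypothetical); the draft role is what later drives the speculative-decoding arg detection.

```yaml
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: speculative-decoding-sample   # hypothetical name
spec:
  replicas: 1
  modelClaims:
    models:
      - name: target-model   # hypothetical Model, serves as the target
        role: main
      - name: draft-model    # hypothetical Model, enables speculative decoding
        role: draft
```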
13 changes: 10 additions & 3 deletions api/inference/v1alpha1/config_types.go
@@ -33,10 +33,17 @@ type BackendRuntimeConfig struct {
// from the default version.
// +optional
Version *string `json:"version,omitempty"`
// Args represents the arguments appended to the backend.
// You can add new args or overwrite the default args.
// ArgName represents the argument name set in the backendRuntime args.
// If not set, it will be derived from the model role, e.g. if one model's role
// is <draft>, the argName will be set to <speculative-decoding>. It is better to
// set the argName explicitly.
// By default, the argName is treated as <default> at runtime.
// +optional
Args []string `json:"args,omitempty"`
ArgName *string `json:"argName,omitempty"`
// ArgFlags represents the argument flags appended to the backend.
// You can add new flags or overwrite the default flags.
// +optional
ArgFlags []string `json:"argFlags,omitempty"`
// Envs represents the environments set to the container.
// +optional
Envs []corev1.EnvVar `json:"envs,omitempty"`
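A minimal sketch of how the two new fields combine in a Playground spec (the backend name and flag values here are illustrative, not defaults):

```yaml
backendRuntimeConfig:
  name: vllm
  # Pin the arg set explicitly instead of relying on role-based detection.
  argName: speculative-decoding
  # Extra flags appended after the rendered flags of the selected arg set.
  argFlags:
    - --swap-space
    - "2"
```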
9 changes: 7 additions & 2 deletions api/inference/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

14 changes: 11 additions & 3 deletions config/crd/bases/inference.llmaz.io_playgrounds.yaml
@@ -46,13 +46,21 @@ spec:
BackendRuntimeConfig represents the inference backendRuntime configuration
under the hood, e.g. vLLM, which is the default backendRuntime.
properties:
args:
argFlags:
description: |-
Args represents the arguments appended to the backend.
You can add new args or overwrite the default args.
ArgFlags represents the argument flags appended to the backend.
You can add new flags or overwrite the default flags.
items:
type: string
type: array
argName:
description: |-
ArgName represents the argument name set in the backendRuntime args.
If not set, it will be derived from the model role, e.g. if one model's role
is <draft>, the argName will be set to <speculative-decoding>. It is better to
set the argName explicitly.
By default, the argName is treated as <default> at runtime.
type: string
envs:
description: Envs represents the environments set to the container.
items:
2 changes: 1 addition & 1 deletion docs/examples/llamacpp/playground.yaml
@@ -8,5 +8,5 @@ spec:
modelName: qwen2-0--5b-gguf
backendRuntimeConfig:
name: llamacpp
args:
argFlags:
- -fa # use flash attention
@@ -15,7 +15,7 @@ spec:
role: draft
backendRuntimeConfig:
name: llamacpp
args:
argFlags:
- -fa # use flash attention
resources:
requests:
4 changes: 2 additions & 2 deletions pkg/controller/inference/playground_controller.go
@@ -257,14 +257,14 @@ func buildWorkloadTemplate(models []*coreapi.OpenModel, playground *inferenceapi
func buildWorkerTemplate(models []*coreapi.OpenModel, playground *inferenceapi.Playground, backendRuntime *inferenceapi.BackendRuntime) (corev1.PodTemplateSpec, error) {
parser := helper.NewBackendRuntimeParser(backendRuntime)

args, err := parser.Args(helper.PlaygroundInferenceMode(playground), models)
args, err := parser.Args(playground, models)
if err != nil {
return corev1.PodTemplateSpec{}, err
}
envs := parser.Envs()

if playground.Spec.BackendRuntimeConfig != nil {
args = append(args, playground.Spec.BackendRuntimeConfig.Args...)
args = append(args, playground.Spec.BackendRuntimeConfig.ArgFlags...)
envs = append(envs, playground.Spec.BackendRuntimeConfig.Envs...)
}

37 changes: 16 additions & 21 deletions pkg/controller_helper/backendruntime.go
@@ -45,34 +45,29 @@ func (p *BackendRuntimeParser) Envs() []corev1.EnvVar {
return p.backendRuntime.Spec.Envs
}

func (p *BackendRuntimeParser) Args(mode InferenceMode, models []*coreapi.OpenModel) ([]string, error) {
// TODO: add validation in webhook.
if mode == SpeculativeDecodingInferenceMode && len(models) != 2 {
return nil, fmt.Errorf("models number not right, want 2, got %d", len(models))
func (p *BackendRuntimeParser) Args(playground *inferenceapi.Playground, models []*coreapi.OpenModel) ([]string, error) {
var argName string
if playground.Spec.BackendRuntimeConfig != nil && playground.Spec.BackendRuntimeConfig.ArgName != nil {
argName = *playground.Spec.BackendRuntimeConfig.ArgName
} else {
// Auto-detect the arg name from the model roles.
argName = DetectArgFrom(playground)
}

modelInfo := map[string]string{}

if mode == DefaultInferenceMode {
source := modelSource.NewModelSourceProvider(models[0])
modelInfo = map[string]string{
"ModelPath": source.ModelPath(),
"ModelName": source.ModelName(),
}
source := modelSource.NewModelSourceProvider(models[0])
modelInfo := map[string]string{
"ModelPath": source.ModelPath(),
"ModelName": source.ModelName(),
}

if mode == SpeculativeDecodingInferenceMode {
targetSource := modelSource.NewModelSourceProvider(models[0])
draftSource := modelSource.NewModelSourceProvider(models[1])
modelInfo = map[string]string{
"ModelPath": targetSource.ModelPath(),
"ModelName": targetSource.ModelName(),
"DraftModelPath": draftSource.ModelPath(),
}
// TODO: This is not reliable because two models don't always mean speculative-decoding.
// Revisit this later.
if len(models) > 1 {
modelInfo["DraftModelPath"] = modelSource.NewModelSourceProvider(models[1]).ModelPath()
}

for _, arg := range p.backendRuntime.Spec.Args {
if InferenceMode(arg.Name) == mode {
if arg.Name == argName {
return renderFlags(arg.Flags, modelInfo)
}
}
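The selected arg set's flags are then passed through renderFlags together with the model info map. A minimal sketch of what such rendering could look like, assuming Go text/template placeholders such as {{.ModelPath}} (the project's actual renderFlags may differ, e.g. in how unresolved placeholders are handled):

```go
package main

import (
	"fmt"
	"strings"
	"text/template"
)

// renderFlags substitutes {{ .ModelPath }}-style placeholders in each flag.
// Flags whose placeholders cannot be resolved (e.g. DraftModelPath when no
// draft model is claimed) are skipped in this sketch.
func renderFlags(flags []string, modelInfo map[string]string) ([]string, error) {
	rendered := make([]string, 0, len(flags))
	for _, flag := range flags {
		tmpl, err := template.New("flag").Option("missingkey=error").Parse(flag)
		if err != nil {
			return nil, err
		}
		var sb strings.Builder
		if err := tmpl.Execute(&sb, modelInfo); err != nil {
			continue // unresolved placeholder, drop this flag
		}
		rendered = append(rendered, sb.String())
	}
	return rendered, nil
}

func main() {
	flags := []string{"--model={{.ModelPath}}", "--draft-model={{.DraftModelPath}}"}
	// Only ModelPath is known here, so the draft flag is dropped.
	fmt.Println(renderFlags(flags, map[string]string{"ModelPath": "models/qwen2-0.5b"}))
}
```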
17 changes: 7 additions & 10 deletions pkg/controller_helper/helper.go
@@ -25,31 +25,28 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

type InferenceMode string

// These two args are preset.
const (
DefaultInferenceMode InferenceMode = "default"
SpeculativeDecodingInferenceMode InferenceMode = "speculative-decoding"
DefaultArg string = "default"
SpeculativeDecodingArg string = "speculative-decoding"
)

// PlaygroundInferenceMode gets the mode of inference process, supports default
// or speculative-decoding for now, which is aligned with backendRuntime.
func PlaygroundInferenceMode(playground *inferenceapi.Playground) InferenceMode {
// DetectArgFrom will auto-detect the arg from model roles if not set explicitly.
func DetectArgFrom(playground *inferenceapi.Playground) string {
if playground.Spec.ModelClaim != nil {
return DefaultInferenceMode
return DefaultArg
}

if playground.Spec.ModelClaims != nil {
for _, mr := range playground.Spec.ModelClaims.Models {
if *mr.Role == coreapi.DraftRole {
return SpeculativeDecodingInferenceMode
return SpeculativeDecodingArg
}
}
}

// We should not reach here.
return DefaultInferenceMode
return DefaultArg
}

func FetchModelsByService(ctx context.Context, k8sClient client.Client, service *inferenceapi.Service) (models []*coreapi.OpenModel, err error) {
4 changes: 2 additions & 2 deletions pkg/webhook/playground_webhook.go
@@ -112,8 +112,8 @@ func (w *PlaygroundWebhook) generateValidate(obj runtime.Object) field.ErrorList
}
}

mode := helper.PlaygroundInferenceMode(playground)
if mode == helper.SpeculativeDecodingInferenceMode {
arg := helper.DetectArgFrom(playground)
if arg == helper.SpeculativeDecodingArg {
if len(playground.Spec.ModelClaims.Models) != 2 {
allErrs = append(allErrs, field.Forbidden(specPath.Child("modelClaims", "models"), "exactly two models are required in speculativeDecoding mode"))
}
35 changes: 35 additions & 0 deletions test/config/backends/fake_backend.yaml
@@ -0,0 +1,35 @@
apiVersion: inference.llmaz.io/v1alpha1
kind: BackendRuntime
metadata:
labels:
app.kubernetes.io/name: backendruntime
app.kubernetes.io/part-of: llmaz
app.kubernetes.io/created-by: llmaz
name: fake-backend
spec:
commands:
- sh
- -c
- echo "hello"
image: busybox
version: latest
args:
- name: default
flags:
- mode
- "default"
- name: speculative-decoding
flags:
- mode
- "speculative-decoding"
- name: fuz
flags:
- mode
- "fuz"
resources:
requests:
cpu: 4
memory: 8Gi
limits:
cpu: 4
memory: 8Gi
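This fixture gives the integration test below a third, non-preset arg set to select. Conceptually, a Playground pinning it could look like this sketch (model name hypothetical):

```yaml
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: fake-playground   # hypothetical name
spec:
  replicas: 1
  modelClaim:
    modelName: qwen2-0--5b   # hypothetical Model name
  backendRuntimeConfig:
    name: fake-backend
    argName: fuz   # selects the ["mode", "fuz"] flags above
```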
26 changes: 22 additions & 4 deletions test/integration/controller/inference/playground_test.go
@@ -183,7 +183,7 @@ var _ = ginkgo.Describe("playground controller test", func() {
ginkgo.Entry("advance configured Playground with sglang", &testValidatingCase{
makePlayground: func() *inferenceapi.Playground {
return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name).
BackendRuntime("sglang").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR").
BackendRuntime("sglang").BackendRuntimeVersion("main").BackendRuntimeArgFlags([]string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR").
BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10").
Obj()
},
@@ -211,7 +211,7 @@
ginkgo.Entry("advance configured Playground with llamacpp", &testValidatingCase{
makePlayground: func() *inferenceapi.Playground {
return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name).
BackendRuntime("llamacpp").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR").
BackendRuntime("llamacpp").BackendRuntimeVersion("main").BackendRuntimeArgFlags([]string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR").
BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10").
Obj()
},
@@ -239,7 +239,7 @@
ginkgo.Entry("advance configured Playground with tgi", &testValidatingCase{
makePlayground: func() *inferenceapi.Playground {
return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name).
BackendRuntime("tgi").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--model-id", "Qwen/Qwen2-0.5B-Instruct"}).BackendRuntimeEnv("FOO", "BAR").
BackendRuntime("tgi").BackendRuntimeVersion("main").BackendRuntimeArgFlags([]string{"--model-id", "Qwen/Qwen2-0.5B-Instruct"}).BackendRuntimeEnv("FOO", "BAR").
BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10").
Obj()
},
@@ -267,7 +267,7 @@
ginkgo.Entry("advance configured Playground with ollama", &testValidatingCase{
makePlayground: func() *inferenceapi.Playground {
return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name).
BackendRuntime("ollama").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR").
BackendRuntime("ollama").BackendRuntimeVersion("main").BackendRuntimeArgFlags([]string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR").
BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10").
Obj()
},
@@ -292,6 +292,24 @@
},
},
}),
ginkgo.Entry("advance configured Playground with argName set", &testValidatingCase{
makePlayground: func() *inferenceapi.Playground {
return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name).
BackendRuntime("fake-backend").BackendRuntimeVersion("main").BackendRuntimeArgName("fuz").BackendRuntimeArgFlags([]string{"--model-id", "Qwen/Qwen2-0.5B-Instruct"}).BackendRuntimeEnv("FOO", "BAR").
BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10").
Obj()
},
updates: []*update{
{
updateFunc: func(playground *inferenceapi.Playground) {
gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed())
},
checkFunc: func(ctx context.Context, k8sClient client.Client, playground *inferenceapi.Playground) {
validation.ValidatePlayground(ctx, k8sClient, playground)
},
},
},
}),
ginkgo.Entry("playground is created when service exists with the same name", &testValidatingCase{
makePlayground: func() *inferenceapi.Playground {
return util.MockASamplePlayground(ns.Name)
18 changes: 17 additions & 1 deletion test/util/validation/validate_playground.go
@@ -22,6 +22,7 @@ import (
"fmt"
"os"
"slices"
"strings"

"github.com/google/go-cmp/cmp"
"github.com/onsi/gomega"
@@ -110,11 +111,26 @@ func ValidatePlayground(ctx context.Context, k8sClient client.Client, playground
return fmt.Errorf("expected container image %s, got %s", parser.Image(parser.Version()), service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image)
}
}
for _, arg := range playground.Spec.BackendRuntimeConfig.Args {

// We assume the 0-indexed arg is the default one.
argFlags := backendRuntime.Spec.Args[0].Flags
if playground.Spec.BackendRuntimeConfig.ArgName != nil {
for _, arg := range backendRuntime.Spec.Args {
if arg.Name == *playground.Spec.BackendRuntimeConfig.ArgName {
argFlags = arg.Flags
}
}
}
argFlags = append(argFlags, playground.Spec.BackendRuntimeConfig.ArgFlags...)
for _, arg := range argFlags {
if strings.Contains(arg, "{{") && strings.Contains(arg, "}}") {
continue
}
if !slices.Contains(service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Args, arg) {
return fmt.Errorf("didn't contain arg: %s", arg)
}
}

if diff := cmp.Diff(service.Spec.WorkloadTemplate.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Env, playground.Spec.BackendRuntimeConfig.Envs); diff != "" {
return fmt.Errorf("unexpected envs")
}