Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions .github/workflows/publish-helm-chart.yaml

This file was deleted.

3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -295,15 +295,14 @@ $(HELMIFY): $(LOCALBIN)

.PHONY: helm
helm: manifests kustomize helmify
$(KUBECTL) create namespace llmaz-system --dry-run=client -o yaml | $(KUBECTL) apply -f -
$(KUSTOMIZE) build config/default | $(HELMIFY) -crd-dir

.PHONY: helm-install
helm-install: helm
helm upgrade --install llmaz ./chart --namespace llmaz-system --create-namespace -f ./chart/values.global.yaml

.PHONY: helm-package
helm-package:
helm-package: helm
# Make sure the appended values always start on a new line.
printf "\n" >> ./chart/values.yaml
cat ./chart/values.global.yaml >> ./chart/values.yaml
Expand Down
8 changes: 6 additions & 2 deletions api/core/v1alpha1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,14 @@ const (
DraftRole ModelRole = "draft"
)

type ModelRepresentative struct {
// ModelRefer refers to a created Model with its role.
type ModelRefer struct {
// Name represents the model name.
Name ModelName `json:"name"`
// Role represents the model role once more than one model is required.
// Such as a draft role, which means running with SpeculativeDecoding,
// and default arguments for backend will be searched in backendRuntime
// with the name of speculative-decoding.
// +kubebuilder:validation:Enum={main,draft}
// +kubebuilder:default=main
// +optional
Expand All @@ -148,7 +152,7 @@ type ModelClaims struct {
// speculative decoding, then one model is main(target) model, another one
// is draft model.
// +kubebuilder:validation:MinItems=1
Models []ModelRepresentative `json:"models,omitempty"`
Models []ModelRefer `json:"models,omitempty"`
// InferenceFlavors represents a list of flavors with fungibility supported
// to serve the model.
// - If not set, always apply with the 0-index model by default.
Expand Down
10 changes: 5 additions & 5 deletions api/core/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 10 additions & 14 deletions api/inference/v1alpha1/backendruntime_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,15 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type InferenceMode string

const (
DefaultInferenceMode InferenceMode = "Default"
SpeculativeDecodingInferenceMode InferenceMode = "SpeculativeDecoding"
)

// BackendRuntimeArg holds preset arguments for ease of use.
// Do not edit the preset names unless you set the argument name explicitly
// in Playground backendRuntimeConfig.
type BackendRuntimeArg struct {
Mode InferenceMode `json:"mode"`
Flags []string `json:"flags,omitempty"`
// Name represents the identifier of the backendRuntime argument.
Name string `json:"name"`
// Flags represents all the preset configurations.
// A flag wrapped in {{ .CONFIG }} is a configuration waiting to be rendered.
Flags []string `json:"flags,omitempty"`
}

// BackendRuntimeSpec defines the desired state of BackendRuntime
Expand All @@ -43,11 +42,8 @@ type BackendRuntimeSpec struct {
// Version represents the default version of the backendRuntime.
// It will be appended to the image as a tag.
Version string `json:"version"`
// Args represents the args of the backendRuntime.
// They can be appended or overwritten by the Playground args.
// The key is the inference option, like default one or advanced
// speculativeDecoding, the values are the corresponding args.
// Flag around with {{ .XXX }} is a flag waiting for render.
// Args represents the preset arguments of the backendRuntime.
// They can be appended or overwritten by the Playground backendRuntimeConfig.
Args []BackendRuntimeArg `json:"args,omitempty"`
// Envs represents the environments set to the container.
// +optional
Expand Down
4 changes: 2 additions & 2 deletions chart/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.0.2
version: 0.0.3
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.0.6"
appVersion: 0.0.7
20 changes: 13 additions & 7 deletions chart/crds/backendruntime-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,27 @@ spec:
properties:
args:
description: |-
Args represents the args of the backendRuntime.
They can be appended or overwritten by the Playground args.
The key is the inference option, like default one or advanced
speculativeDecoding, the values are the corresponding args.
Flag around with {{ .XXX }} is a flag waiting for render.
Args represents the preset arguments of the backendRuntime.
They can be appended or overwritten by the Playground backendRuntimeConfig.
items:
description: |-
BackendRuntimeArg is preset arguments for easy to use.
Do not edit the preset names unless set the argument name explicitly
in Playground backendRuntimeConfig.
properties:
flags:
description: |-
Flags represents all the preset configurations.
Flag around with {{ .CONFIG }} is a configuration waiting for render.
items:
type: string
type: array
mode:
name:
description: Name represents the identifier of the backendRuntime
argument.
type: string
required:
- mode
- name
type: object
type: array
commands:
Expand Down
9 changes: 7 additions & 2 deletions chart/crds/playground-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -259,14 +259,19 @@ spec:
speculative decoding, then one model is main(target) model, another one
is draft model.
items:
description: ModelRefer refers to a created Model with its
role.
properties:
name:
description: Name represents the model name.
type: string
role:
default: main
description: Role represents the model role once more than
one model is required.
description: |-
Role represents the model role once more than one model is required.
Such as a draft role, which means running with SpeculativeDecoding,
and default arguments for backend will be searched in backendRuntime
with the name of speculative-decoding.
enum:
- main
- draft
Expand Down
9 changes: 7 additions & 2 deletions chart/crds/service-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,19 @@ spec:
speculative decoding, then one model is main(target) model, another one
is draft model.
items:
description: ModelRefer refers to a created Model with its
role.
properties:
name:
description: Name represents the model name.
type: string
role:
default: main
description: Role represents the model role once more than
one model is required.
description: |-
Role represents the model role once more than one model is required.
Such as a draft role, which means running with SpeculativeDecoding,
and default arguments for backend will be searched in backendRuntime
with the name of speculative-decoding.
enum:
- main
- draft
Expand Down
6 changes: 4 additions & 2 deletions chart/templates/backends/llamacpp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,18 @@ spec:
- ./llama-server
image: ghcr.io/ggerganov/llama.cpp
version: server
# Do not edit the preset argument name unless you know what you're doing.
# Feel free to add more arguments to suit your requirements.
args:
- mode: Default
- name: default
flags:
- -m
- "{{`{{ .ModelPath }}`}}"
- --host
- "0.0.0.0"
- --port
- "8080"
- mode: SpeculativeDecoding
- name: speculative-decoding
flags:
- -m
- "{{`{{ .ModelPath }}`}}"
Expand Down
4 changes: 3 additions & 1 deletion chart/templates/backends/sglang.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ spec:
- sglang.launch_server
image: lmsysorg/sglang
version: v0.2.10-cu121
# Do not edit the preset argument name unless you know what you're doing.
# Feel free to add more arguments to suit your requirements.
args:
- mode: Default
- name: default
flags:
- --model-path
- "{{`{{ .ModelPath }}`}}"
Expand Down
6 changes: 4 additions & 2 deletions chart/templates/backends/vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ spec:
- vllm.entrypoints.openai.api_server
image: vllm/vllm-openai
version: v0.6.0
# Do not edit the preset argument name unless you know what you're doing.
# Feel free to add more arguments to suit your requirements.
args:
- mode: Default
- name: default
flags:
- --model
- "{{`{{ .ModelPath }}`}}"
Expand All @@ -25,7 +27,7 @@ spec:
- "0.0.0.0"
- --port
- "8080"
- mode: SpeculativeDecoding
- name: speculative-decoding
flags:
- --model
- "{{`{{ .ModelPath }}`}}"
Expand Down
6 changes: 3 additions & 3 deletions client-go/applyconfiguration/core/v1alpha1/modelclaims.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions client-go/applyconfiguration/utils.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 13 additions & 7 deletions config/crd/bases/inference.llmaz.io_backendruntimes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,27 @@ spec:
properties:
args:
description: |-
Args represents the args of the backendRuntime.
They can be appended or overwritten by the Playground args.
The key is the inference option, like default one or advanced
speculativeDecoding, the values are the corresponding args.
Flag around with {{ .XXX }} is a flag waiting for render.
Args represents the preset arguments of the backendRuntime.
They can be appended or overwritten by the Playground backendRuntimeConfig.
items:
description: |-
BackendRuntimeArg is preset arguments for easy to use.
Do not edit the preset names unless set the argument name explicitly
in Playground backendRuntimeConfig.
properties:
flags:
description: |-
Flags represents all the preset configurations.
Flag around with {{ .CONFIG }} is a configuration waiting for render.
items:
type: string
type: array
mode:
name:
description: Name represents the identifier of the backendRuntime
argument.
type: string
required:
- mode
- name
type: object
type: array
commands:
Expand Down
9 changes: 7 additions & 2 deletions config/crd/bases/inference.llmaz.io_playgrounds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -260,14 +260,19 @@ spec:
speculative decoding, then one model is main(target) model, another one
is draft model.
items:
description: ModelRefer refers to a created Model with its
role.
properties:
name:
description: Name represents the model name.
type: string
role:
default: main
description: Role represents the model role once more than
one model is required.
description: |-
Role represents the model role once more than one model is required.
Such as a draft role, which means running with SpeculativeDecoding,
and default arguments for backend will be searched in backendRuntime
with the name of speculative-decoding.
enum:
- main
- draft
Expand Down
Loading