diff --git a/README.md b/README.md index 8944eb37..8e7d0c15 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Easy, advanced inference platform for large language models on Kubernetes - **SOTA Inference**: llmaz supports the latest cutting-edge researches like [Speculative Decoding](https://arxiv.org/abs/2211.17192) or [Splitwise](https://arxiv.org/abs/2311.18677)(WIP) to run on Kubernetes. - **Various Model Providers**: llmaz supports a wide range of model providers, such as [HuggingFace](https://huggingface.co/), [ModelScope](https://www.modelscope.cn), ObjectStores. llmaz will automatically handle the model loading, requiring no effort from users. - **Multi-hosts Support**: llmaz supports both single-host and multi-hosts scenarios with [LWS](https://github.com/kubernetes-sigs/lws) from day 0. -- **Scaling Efficiency (WIP)**: llmaz works smoothly with autoscaling components like [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) or [Karpenter](https://github.com/kubernetes-sigs/karpenter) to satisfy elastic needs. +- **Scaling Efficiency**: llmaz supports horizontal scaling with [HPA](./docs/examples/hpa/README.md) by default and will integrate with autoscaling components like [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) or [Karpenter](https://github.com/kubernetes-sigs/karpenter) for smart scaling across different clouds. ## Quick Start diff --git a/api/inference/v1alpha1/backendruntime_types.go b/api/inference/v1alpha1/backendruntime_types.go index 8907e717..808222bb 100644 --- a/api/inference/v1alpha1/backendruntime_types.go +++ b/api/inference/v1alpha1/backendruntime_types.go @@ -34,10 +34,10 @@ type BackendRuntimeArg struct { Flags []string `json:"flags,omitempty"` } -// HPAConfig represents the configuration of the HorizontalPodAutoscaler. 
+// HPATrigger represents the configuration of the HorizontalPodAutoscaler. // Inspired by kubernetes.io/pkg/apis/autoscaling/types.go#HorizontalPodAutoscalerSpec. // Note: HPA component should be installed in prior. -type HPAConfig struct { +type HPATrigger struct { // metrics contains the specifications for which to use to calculate the // desired replica count (the maximum replica count across all metrics will // be used). The desired replica count is calculated multiplying the @@ -54,11 +54,10 @@ type HPAConfig struct { Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"` } -// ScalePolicy defines the policy for scaling the workloads. -// Support HPA only for now. -type ScalePolicy struct { - // HPA represents the configuration of the HorizontalPodAutoscaler. - HPA *HPAConfig `json:"hpa,omitempty"` +// ScaleTrigger defines the scaler triggers to scale the workloads. +type ScaleTrigger struct { + // HPA represents the trigger configuration of the HorizontalPodAutoscaler. + HPA *HPATrigger `json:"hpa,omitempty"` } // MultiHostCommands represents leader & worker commands for multiple nodes scenarios. @@ -108,10 +107,11 @@ type BackendRuntimeSpec struct { // when it might take a long time to load data or warm a cache, than during steady-state operation. // +optional StartupProbe *corev1.Probe `json:"startupProbe,omitempty"` - // ScalePolicy represents the rules for scaling the backend based on the metrics. - // If playground doesn't define the ScalePolicy, the defaulted policy here will be used. + // ScaleTrigger represents a set of triggers to scale the workloads based on metrics, + // only one trigger could work at a time and only HPA is supported right now. + // If playground doesn't define the ScaleTrigger, the trigger defined here will be used. 
// +optional - ScalePolicy *ScalePolicy `json:"scalePolicy,omitempty"` + ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"` } // BackendRuntimeStatus defines the observed state of BackendRuntime diff --git a/api/inference/v1alpha1/config_types.go b/api/inference/v1alpha1/config_types.go index 59afa04d..ac983211 100644 --- a/api/inference/v1alpha1/config_types.go +++ b/api/inference/v1alpha1/config_types.go @@ -61,16 +61,18 @@ type ResourceRequirements struct { type ElasticConfig struct { // MinReplicas indicates the minimum number of inference workloads based on the traffic. - // Default to nil means we can scale down the instances to 1. - // If minReplicas set to 0, it requires to install serverless component at first. - MinReplicas int32 `json:"minReplicas"` + // Default to 1. + // MinReplicas couldn't be 0 now, will support serverless in the future. + // +kubebuilder:default=1 + // +optional + MinReplicas *int32 `json:"minReplicas,omitempty"` // MaxReplicas indicates the maximum number of inference workloads based on the traffic. // Default to nil means there's no limit for the instance number. // +optional MaxReplicas *int32 `json:"maxReplicas,omitempty"` - // ScalePolicy defines the rules for scaling the workloads. - // If not defined, policy configured in backendRuntime will be used, - // otherwise, policy defined here will overwrite the defaulted ones. + // ScaleTrigger defines a set of triggers to scale the workloads. + // If not defined, trigger configured in backendRuntime will be used, + // otherwise, trigger defined here will overwrite the defaulted ones. 
// +optional - ScalePolicy *ScalePolicy `json:"scalePolicy,omitempty"` + ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"` } diff --git a/api/inference/v1alpha1/zz_generated.deepcopy.go b/api/inference/v1alpha1/zz_generated.deepcopy.go index 69152c21..a14b0ce5 100644 --- a/api/inference/v1alpha1/zz_generated.deepcopy.go +++ b/api/inference/v1alpha1/zz_generated.deepcopy.go @@ -192,9 +192,9 @@ func (in *BackendRuntimeSpec) DeepCopyInto(out *BackendRuntimeSpec) { *out = new(v1.Probe) (*in).DeepCopyInto(*out) } - if in.ScalePolicy != nil { - in, out := &in.ScalePolicy, &out.ScalePolicy - *out = new(ScalePolicy) + if in.ScaleTrigger != nil { + in, out := &in.ScaleTrigger, &out.ScaleTrigger + *out = new(ScaleTrigger) (*in).DeepCopyInto(*out) } } @@ -234,14 +234,19 @@ func (in *BackendRuntimeStatus) DeepCopy() *BackendRuntimeStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ElasticConfig) DeepCopyInto(out *ElasticConfig) { *out = *in + if in.MinReplicas != nil { + in, out := &in.MinReplicas, &out.MinReplicas + *out = new(int32) + **out = **in + } if in.MaxReplicas != nil { in, out := &in.MaxReplicas, &out.MaxReplicas *out = new(int32) **out = **in } - if in.ScalePolicy != nil { - in, out := &in.ScalePolicy, &out.ScalePolicy - *out = new(ScalePolicy) + if in.ScaleTrigger != nil { + in, out := &in.ScaleTrigger, &out.ScaleTrigger + *out = new(ScaleTrigger) (*in).DeepCopyInto(*out) } } @@ -257,7 +262,7 @@ func (in *ElasticConfig) DeepCopy() *ElasticConfig { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *HPAConfig) DeepCopyInto(out *HPAConfig) { +func (in *HPATrigger) DeepCopyInto(out *HPATrigger) { *out = *in if in.Metrics != nil { in, out := &in.Metrics, &out.Metrics @@ -273,12 +278,12 @@ func (in *HPAConfig) DeepCopyInto(out *HPAConfig) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HPAConfig. -func (in *HPAConfig) DeepCopy() *HPAConfig { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HPATrigger. +func (in *HPATrigger) DeepCopy() *HPATrigger { if in == nil { return nil } - out := new(HPAConfig) + out := new(HPATrigger) in.DeepCopyInto(out) return out } @@ -459,21 +464,21 @@ func (in *ResourceRequirements) DeepCopy() *ResourceRequirements { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ScalePolicy) DeepCopyInto(out *ScalePolicy) { +func (in *ScaleTrigger) DeepCopyInto(out *ScaleTrigger) { *out = *in if in.HPA != nil { in, out := &in.HPA, &out.HPA - *out = new(HPAConfig) + *out = new(HPATrigger) (*in).DeepCopyInto(*out) } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScalePolicy. -func (in *ScalePolicy) DeepCopy() *ScalePolicy { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScaleTrigger. +func (in *ScaleTrigger) DeepCopy() *ScaleTrigger { if in == nil { return nil } - out := new(ScalePolicy) + out := new(ScaleTrigger) in.DeepCopyInto(out) return out } diff --git a/chart/templates/backends/llamacpp.yaml b/chart/templates/backends/llamacpp.yaml index 2eedfab6..2b85c24c 100644 --- a/chart/templates/backends/llamacpp.yaml +++ b/chart/templates/backends/llamacpp.yaml @@ -24,16 +24,16 @@ spec: - --port - "8080" # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240. 
- - name: speculative-decoding - flags: - - -m - - "{{`{{ .ModelPath }}`}}" - - -md - - "{{`{{ .DraftModelPath }}`}}" - - --host - - "0.0.0.0" - - --port - - "8080" + # - name: speculative-decoding + # flags: + # - -m + # - "{{`{{ .ModelPath }}`}}" + # - -md + # - "{{`{{ .DraftModelPath }}`}}" + # - --host + # - "0.0.0.0" + # - --port + # - "8080" resources: requests: cpu: 2 diff --git a/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go b/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go index cef19536..fcc4d84d 100644 --- a/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go +++ b/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go @@ -20,9 +20,9 @@ package v1alpha1 // ElasticConfigApplyConfiguration represents a declarative configuration of the ElasticConfig type for use // with apply. type ElasticConfigApplyConfiguration struct { - MinReplicas *int32 `json:"minReplicas,omitempty"` - MaxReplicas *int32 `json:"maxReplicas,omitempty"` - ScalePolicy *ScalePolicyApplyConfiguration `json:"scalePolicy,omitempty"` + MinReplicas *int32 `json:"minReplicas,omitempty"` + MaxReplicas *int32 `json:"maxReplicas,omitempty"` + ScaleTrigger *ScaleTriggerApplyConfiguration `json:"scaleTrigger,omitempty"` } // ElasticConfigApplyConfiguration constructs a declarative configuration of the ElasticConfig type for use with @@ -47,10 +47,10 @@ func (b *ElasticConfigApplyConfiguration) WithMaxReplicas(value int32) *ElasticC return b } -// WithScalePolicy sets the ScalePolicy field in the declarative configuration to the given value +// WithScaleTrigger sets the ScaleTrigger field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the ScalePolicy field is set to the value of the last call. 
-func (b *ElasticConfigApplyConfiguration) WithScalePolicy(value *ScalePolicyApplyConfiguration) *ElasticConfigApplyConfiguration { - b.ScalePolicy = value +// If called multiple times, the ScaleTrigger field is set to the value of the last call. +func (b *ElasticConfigApplyConfiguration) WithScaleTrigger(value *ScaleTriggerApplyConfiguration) *ElasticConfigApplyConfiguration { + b.ScaleTrigger = value return b } diff --git a/client-go/applyconfiguration/inference/v1alpha1/hpaconfig.go b/client-go/applyconfiguration/inference/v1alpha1/hpatrigger.go similarity index 72% rename from client-go/applyconfiguration/inference/v1alpha1/hpaconfig.go rename to client-go/applyconfiguration/inference/v1alpha1/hpatrigger.go index a7345c1d..abe35cb7 100644 --- a/client-go/applyconfiguration/inference/v1alpha1/hpaconfig.go +++ b/client-go/applyconfiguration/inference/v1alpha1/hpatrigger.go @@ -21,23 +21,23 @@ import ( v2 "k8s.io/api/autoscaling/v2" ) -// HPAConfigApplyConfiguration represents a declarative configuration of the HPAConfig type for use +// HPATriggerApplyConfiguration represents a declarative configuration of the HPATrigger type for use // with apply. -type HPAConfigApplyConfiguration struct { +type HPATriggerApplyConfiguration struct { Metrics []v2.MetricSpec `json:"metrics,omitempty"` Behavior *v2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"` } -// HPAConfigApplyConfiguration constructs a declarative configuration of the HPAConfig type for use with +// HPATriggerApplyConfiguration constructs a declarative configuration of the HPATrigger type for use with // apply. -func HPAConfig() *HPAConfigApplyConfiguration { - return &HPAConfigApplyConfiguration{} +func HPATrigger() *HPATriggerApplyConfiguration { + return &HPATriggerApplyConfiguration{} } // WithMetrics adds the given value to the Metrics field in the declarative configuration // and returns the receiver, so that objects can be build by chaining "With" function invocations. 
// If called multiple times, values provided by each call will be appended to the Metrics field. -func (b *HPAConfigApplyConfiguration) WithMetrics(values ...v2.MetricSpec) *HPAConfigApplyConfiguration { +func (b *HPATriggerApplyConfiguration) WithMetrics(values ...v2.MetricSpec) *HPATriggerApplyConfiguration { for i := range values { b.Metrics = append(b.Metrics, values[i]) } @@ -47,7 +47,7 @@ func (b *HPAConfigApplyConfiguration) WithMetrics(values ...v2.MetricSpec) *HPAC // WithBehavior sets the Behavior field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Behavior field is set to the value of the last call. -func (b *HPAConfigApplyConfiguration) WithBehavior(value v2.HorizontalPodAutoscalerBehavior) *HPAConfigApplyConfiguration { +func (b *HPATriggerApplyConfiguration) WithBehavior(value v2.HorizontalPodAutoscalerBehavior) *HPATriggerApplyConfiguration { b.Behavior = &value return b } diff --git a/client-go/applyconfiguration/inference/v1alpha1/scalepolicy.go b/client-go/applyconfiguration/inference/v1alpha1/scaletrigger.go similarity index 63% rename from client-go/applyconfiguration/inference/v1alpha1/scalepolicy.go rename to client-go/applyconfiguration/inference/v1alpha1/scaletrigger.go index d91d4e6e..5bee2bd4 100644 --- a/client-go/applyconfiguration/inference/v1alpha1/scalepolicy.go +++ b/client-go/applyconfiguration/inference/v1alpha1/scaletrigger.go @@ -17,22 +17,22 @@ limitations under the License. package v1alpha1 -// ScalePolicyApplyConfiguration represents a declarative configuration of the ScalePolicy type for use +// ScaleTriggerApplyConfiguration represents a declarative configuration of the ScaleTrigger type for use // with apply. 
-type ScalePolicyApplyConfiguration struct { - HPA *HPAConfigApplyConfiguration `json:"hpa,omitempty"` +type ScaleTriggerApplyConfiguration struct { + HPA *HPATriggerApplyConfiguration `json:"hpa,omitempty"` } -// ScalePolicyApplyConfiguration constructs a declarative configuration of the ScalePolicy type for use with +// ScaleTriggerApplyConfiguration constructs a declarative configuration of the ScaleTrigger type for use with // apply. -func ScalePolicy() *ScalePolicyApplyConfiguration { - return &ScalePolicyApplyConfiguration{} +func ScaleTrigger() *ScaleTriggerApplyConfiguration { + return &ScaleTriggerApplyConfiguration{} } // WithHPA sets the HPA field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the HPA field is set to the value of the last call. -func (b *ScalePolicyApplyConfiguration) WithHPA(value *HPAConfigApplyConfiguration) *ScalePolicyApplyConfiguration { +func (b *ScaleTriggerApplyConfiguration) WithHPA(value *HPATriggerApplyConfiguration) *ScaleTriggerApplyConfiguration { b.HPA = value return b } diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index 003f8b84..cc2ca402 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -39,8 +39,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &inferencev1alpha1.BackendRuntimeConfigApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("ElasticConfig"): return &inferencev1alpha1.ElasticConfigApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("HPAConfig"): - return &inferencev1alpha1.HPAConfigApplyConfiguration{} + case v1alpha1.SchemeGroupVersion.WithKind("HPATrigger"): + return &inferencev1alpha1.HPATriggerApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("Playground"): return &inferencev1alpha1.PlaygroundApplyConfiguration{} case 
v1alpha1.SchemeGroupVersion.WithKind("PlaygroundSpec"): @@ -49,8 +49,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &inferencev1alpha1.PlaygroundSpecApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("PlaygroundStatus"): return &inferencev1alpha1.PlaygroundStatusApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("ResourceRequirements"): return &inferencev1alpha1.ResourceRequirementsApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("ScalePolicy"): - return &inferencev1alpha1.ScalePolicyApplyConfiguration{} + case v1alpha1.SchemeGroupVersion.WithKind("ScaleTrigger"): + return &inferencev1alpha1.ScaleTriggerApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("Service"): return &inferencev1alpha1.ServiceApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("ServiceSpec"): diff --git a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml index 768e070a..768a30fa 100644 --- a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml +++ b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml @@ -547,13 +547,14 @@ spec: More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object - scalePolicy: + scaleTrigger: description: |- - ScalePolicy represents the rules for scaling the backend based on the metrics. - If playground doesn't define the ScalePolicy, the defaulted policy here will be used. + ScaleTrigger represents a set of triggers to scale the workloads based on metrics, + only one trigger could work at a time and only HPA is supported right now. + If playground doesn't define the ScaleTrigger, the trigger defined here will be used. properties: hpa: - description: HPA represents the configuration of the HorizontalPodAutoscaler. + description: HPA represents the trigger configuration of the HorizontalPodAutoscaler. 
properties: behavior: description: |- diff --git a/config/crd/bases/inference.llmaz.io_playgrounds.yaml b/config/crd/bases/inference.llmaz.io_playgrounds.yaml index 42a548f9..aef022b1 100644 --- a/config/crd/bases/inference.llmaz.io_playgrounds.yaml +++ b/config/crd/bases/inference.llmaz.io_playgrounds.yaml @@ -241,20 +241,22 @@ spec: format: int32 type: integer minReplicas: + default: 1 description: |- MinReplicas indicates the minimum number of inference workloads based on the traffic. - Default to nil means we can scale down the instances to 1. - If minReplicas set to 0, it requires to install serverless component at first. + Default to 1. + MinReplicas couldn't be 0 now, will support serverless in the future. format: int32 type: integer - scalePolicy: + scaleTrigger: description: |- - ScalePolicy defines the rules for scaling the workloads. - If not defined, policy configured in backendRuntime will be used, - otherwise, policy defined here will overwrite the defaulted ones. + ScaleTrigger defines a set of triggers to scale the workloads. + If not defined, trigger configured in backendRuntime will be used, + otherwise, trigger defined here will overwrite the defaulted ones. properties: hpa: - description: HPA represents the configuration of the HorizontalPodAutoscaler. + description: HPA represents the trigger configuration of the + HorizontalPodAutoscaler. 
properties: behavior: description: |- @@ -859,8 +861,6 @@ spec: type: array type: object type: object - required: - - minReplicas type: object modelClaim: description: |- diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 58f2e438..2a540057 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -1,8 +1,8 @@ resources: - - manager.yaml +- manager.yaml apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: - - name: controller - newName: inftyai/llmaz - newTag: v0.0.9 +- name: controller + newName: inftyai/test + newTag: llmaz-012305 diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 407adcb1..4ce1eded 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -23,6 +23,18 @@ rules: - list - update - watch +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - inference.llmaz.io resources: diff --git a/docs/examples/hpa/README.md b/docs/examples/hpa/README.md new file mode 100644 index 00000000..a14e7444 --- /dev/null +++ b/docs/examples/hpa/README.md @@ -0,0 +1,40 @@ +# Horizontal Scaling With Playgrounds + +We only support HPA right now, but will try to integrate with KEDA and Knative in the future. 
+ +## Install the Metric Server + +HPA depends on the metric-server for scaling decisions, so we need to install it in prior, see install command below: + +```cmd +kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml +``` + +## How to Use + +If your backendRuntime has already configured the `ScaleTrigger`, set the `playground.elasticConfig` like this: + +```yaml +spec: + elasticConfig: + minReplicas: 1 + maxReplicas: 3 +``` + +If not, you can set the scaleTrigger directly in Playground like this: + +```yaml +spec: + elasticConfig: + minReplicas: 1 + maxReplicas: 3 + scaleTrigger: + hpa: + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 50 +``` diff --git a/docs/examples/hpa/playground.yaml b/docs/examples/hpa/playground.yaml new file mode 100644 index 00000000..07b68770 --- /dev/null +++ b/docs/examples/hpa/playground.yaml @@ -0,0 +1,26 @@ +apiVersion: inference.llmaz.io/v1alpha1 +kind: Playground +metadata: + name: qwen2-0--5b +spec: + replicas: 1 + modelClaim: + modelName: qwen2-0--5b-gguf + backendRuntimeConfig: + name: llamacpp + args: + name: "default" + flags: + - -fa # use flash attention + elasticConfig: + minReplicas: 1 + maxReplicas: 3 + scaleTrigger: + hpa: + metrics: + - type: Resource + resource: + name: cpu + target: + averageUtilization: 50 + type: Utilization diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go index 4eba09a3..3e291881 100644 --- a/pkg/controller/inference/playground_controller.go +++ b/pkg/controller/inference/playground_controller.go @@ -21,6 +21,7 @@ import ( "fmt" "reflect" + autoscalingv2 "k8s.io/api/autoscaling/v2" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" @@ -30,6 +31,7 @@ import ( metaapplyv1 "k8s.io/client-go/applyconfigurations/meta/v1" 
"k8s.io/client-go/tools/record" "k8s.io/klog/v2" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -67,6 +69,7 @@ func NewPlaygroundReconciler(client client.Client, scheme *runtime.Scheme, recor //+kubebuilder:rbac:groups=inference.llmaz.io,resources=playgrounds,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=inference.llmaz.io,resources=playgrounds/status,verbs=get;update;patch //+kubebuilder:rbac:groups=inference.llmaz.io,resources=playgrounds/finalizers,verbs=update +//+kubebuilder:rbac:groups=autoscaling,resources=horizontalpodautoscalers,verbs=get;list;watch;create;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. @@ -117,17 +120,27 @@ func (r *PlaygroundReconciler) Reconcile(ctx context.Context, req ctrl.Request) logger.Error(err, "failed to build inference Service") return ctrl.Result{}, err } - if err := setControllerReferenceForService(playground, serviceApplyConfiguration, r.Scheme); err != nil { logger.Error(err, "failed to set OwnerReference for Service", "Service", fmt.Sprintf("%s/%s", playground.Namespace, playground.Name)) return ctrl.Result{}, err } - if err := util.Patch(ctx, r.Client, serviceApplyConfiguration); err != nil { logger.Error(err, "failed to patch Service", "Service", fmt.Sprintf("%s/%s", playground.Namespace, playground.Name)) return ctrl.Result{}, err } + scalingConfiguration := buildScalingConfiguration(playground, backendRuntime) + if scalingConfiguration != nil { + if err := setControllerReferenceForScalingConfiguration(playground, scalingConfiguration, r.Scheme); err != nil { + logger.Error(err, "failed to set OwnerReference for scaling workload", "workload", fmt.Sprintf("%s/%s", playground.Namespace, playground.Name), "kind", scalingConfiguration.Kind) + return ctrl.Result{}, err + } + 
if err := util.Patch(ctx, r.Client, scalingConfiguration); err != nil { + logger.Error(err, "failed to patch scaling workload", "workload", fmt.Sprintf("%s/%s", playground.Namespace, playground.Name), "kind", scalingConfiguration.Kind) + return ctrl.Result{}, err + } + } + // Handle status. setPlaygroundCondition(playground, service) if err := r.Client.Status().Update(ctx, playground); err != nil { @@ -513,3 +526,79 @@ func setControllerReferenceForService(owner metav1.Object, saf *inferenceclientg WithController(true)) return nil } + +// buildScalingConfiguration supports HPA only now. +func buildScalingConfiguration(playground *inferenceapi.Playground, backend *inferenceapi.BackendRuntime) *autoscalingv2.HorizontalPodAutoscaler { + if playground.Spec.ElasticConfig == nil { + return nil + } + + // Handle HPA. + if (playground.Spec.ElasticConfig.ScaleTrigger != nil && playground.Spec.ElasticConfig.ScaleTrigger.HPA != nil) || + (backend.Spec.ScaleTrigger != nil && backend.Spec.ScaleTrigger.HPA != nil) { + + hpa := &autoscalingv2.HorizontalPodAutoscaler{ + TypeMeta: metav1.TypeMeta{ + APIVersion: autoscalingv2.SchemeGroupVersion.String(), + Kind: "HorizontalPodAutoscaler", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: playground.Name, + Namespace: playground.Namespace, + }, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{ + APIVersion: inferenceapi.SchemeGroupVersion.String(), + Kind: "Playground", + Name: playground.Name, + }, + }, + } + + hpa.Spec.MinReplicas = playground.Spec.ElasticConfig.MinReplicas + if playground.Spec.ElasticConfig.MaxReplicas == nil { + // The value is hardcoded, because maxReplicas is required by HPA. 
+ hpa.Spec.MaxReplicas = 99999 + } else { + hpa.Spec.MaxReplicas = *playground.Spec.ElasticConfig.MaxReplicas + } + + if playground.Spec.ElasticConfig.ScaleTrigger != nil && playground.Spec.ElasticConfig.ScaleTrigger.HPA != nil { + hpa.Spec.Metrics = playground.Spec.ElasticConfig.ScaleTrigger.HPA.Metrics + hpa.Spec.Behavior = playground.Spec.ElasticConfig.ScaleTrigger.HPA.Behavior + } else { + hpa.Spec.Metrics = backend.Spec.ScaleTrigger.HPA.Metrics + hpa.Spec.Behavior = backend.Spec.ScaleTrigger.HPA.Behavior + } + + return hpa + } + + return nil +} + +func setControllerReferenceForScalingConfiguration(owner metav1.Object, hpa *autoscalingv2.HorizontalPodAutoscaler, scheme *runtime.Scheme) error { + if hpa == nil { + return nil + } + + ro, ok := owner.(runtime.Object) + if !ok { + return fmt.Errorf("%T is not a runtime.Object, cannot call SetOwnerReference", owner) + } + gvk, err := apiutil.GVKForObject(ro, scheme) + if err != nil { + return err + } + hpa.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: gvk.GroupVersion().String(), + Kind: gvk.Kind, + Name: owner.GetName(), + UID: owner.GetUID(), + BlockOwnerDeletion: ptr.To[bool](true), + Controller: ptr.To[bool](true), + }, + } + return nil +} diff --git a/pkg/webhook/playground_webhook.go b/pkg/webhook/playground_webhook.go index acf44077..53c25839 100644 --- a/pkg/webhook/playground_webhook.go +++ b/pkg/webhook/playground_webhook.go @@ -140,5 +140,23 @@ func (w *PlaygroundWebhook) generateValidate(obj runtime.Object) field.ErrorList } } + if playground.Spec.ElasticConfig != nil { + if playground.Spec.ElasticConfig.MinReplicas != nil && *playground.Spec.ElasticConfig.MinReplicas == 0 { + allErrs = append(allErrs, field.Forbidden(specPath.Child("elasticConfig.minReplicas"), "minReplicas couldn't be 0")) + } + + if playground.Spec.ElasticConfig.MinReplicas != nil && playground.Spec.ElasticConfig.MaxReplicas != nil { + if *playground.Spec.ElasticConfig.MinReplicas >= *playground.Spec.ElasticConfig.MaxReplicas { + allErrs = append(allErrs, 
field.Invalid(specPath.Child("elasticConfig.minReplicas"), *playground.Spec.ElasticConfig.MinReplicas, "minReplicas must be less than maxReplicas")) + } + } + + if playground.Spec.ElasticConfig.ScaleTrigger != nil { + if playground.Spec.ElasticConfig.ScaleTrigger.HPA == nil { + allErrs = append(allErrs, field.Forbidden(specPath.Child("elasticConfig.scaleTrigger.hpa"), "hpa couldn't be nil")) + } + } + } + + return allErrs } diff --git a/test/config/backends/llamacpp.yaml b/test/config/backends/llamacpp.yaml index df1fe360..ea4554e4 100644 --- a/test/config/backends/llamacpp.yaml +++ b/test/config/backends/llamacpp.yaml @@ -21,20 +21,20 @@ spec: - --port - "8080" # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240. - - name: speculative-decoding - flags: - - -m - - "{{ .ModelPath }}" - - -md - - "{{ .DraftModelPath }}" - - --host - - "0.0.0.0" - - --port - - "8080" - - --draft-max - - "16" - - --draft-min - - "5" + # - name: speculative-decoding + # flags: + # - -m + # - "{{ .ModelPath }}" + # - -md + # - "{{ .DraftModelPath }}" + # - --host + # - "0.0.0.0" + # - --port + # - "8080" + # - --draft-max + # - "16" + # - --draft-min + # - "5" resources: requests: cpu: 2 diff --git a/test/integration/webhook/playground_test.go b/test/integration/webhook/playground_test.go index f7b10b63..03041980 100644 --- a/test/integration/webhook/playground_test.go +++ b/test/integration/webhook/playground_test.go @@ -111,6 +111,30 @@ var _ = ginkgo.Describe("Playground default and validation", func() { }, failed: true, }), + ginkgo.Entry("hpa couldn't be nil once elasticConfig is not nil", &testValidatingCase{ + playground: func() *inferenceapi.Playground { + return wrapper.MakePlayground("playground", ns.Name).ModelClaim("llama3-8b").Replicas(1).HPA(nil).Obj() + }, + failed: true, + }), + ginkgo.Entry("minReplicas is 0 once elasticConfig is not nil", &testValidatingCase{ + playground: func() *inferenceapi.Playground { + return 
wrapper.MakePlayground("playground", ns.Name).ModelClaim("llama3-8b").Replicas(1).ElasticConfig(0, 10).Obj() + }, + failed: true, + }), + ginkgo.Entry("minReplicas is greater than maxReplicas", &testValidatingCase{ + playground: func() *inferenceapi.Playground { + return wrapper.MakePlayground("playground", ns.Name).ModelClaim("llama3-8b").Replicas(1).ElasticConfig(10, 1).Obj() + }, + failed: true, + }), + ginkgo.Entry("minReplicas is equal with maxReplicas", &testValidatingCase{ + playground: func() *inferenceapi.Playground { + return wrapper.MakePlayground("playground", ns.Name).ModelClaim("llama3-8b").Replicas(1).ElasticConfig(1, 1).Obj() + }, + failed: true, + }), ) type testDefaultingCase struct { diff --git a/test/util/wrapper/playground.go b/test/util/wrapper/playground.go index 71643d63..5b2daef3 100644 --- a/test/util/wrapper/playground.go +++ b/test/util/wrapper/playground.go @@ -163,7 +163,18 @@ func (w *PlaygroundWrapper) BackendRuntimeLimit(r, v string) *PlaygroundWrapper func (w *PlaygroundWrapper) ElasticConfig(minReplicas, maxReplicas int32) *PlaygroundWrapper { w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{ MaxReplicas: ptr.To[int32](maxReplicas), - MinReplicas: minReplicas, + MinReplicas: ptr.To[int32](minReplicas), } return w } + +func (w *PlaygroundWrapper) HPA(config *inferenceapi.HPATrigger) *PlaygroundWrapper { + if w.Spec.ElasticConfig == nil { + w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{} + } + if w.Spec.ElasticConfig.ScaleTrigger == nil { + w.Spec.ElasticConfig.ScaleTrigger = &inferenceapi.ScaleTrigger{} + } + w.Spec.ElasticConfig.ScaleTrigger.HPA = config + return w +}