2 changes: 1 addition & 1 deletion README.md
@@ -37,7 +37,7 @@ Easy, advanced inference platform for large language models on Kubernetes
 - **SOTA Inference**: llmaz supports the latest cutting-edge research like [Speculative Decoding](https://arxiv.org/abs/2211.17192) or [Splitwise](https://arxiv.org/abs/2311.18677) (WIP) to run on Kubernetes.
 - **Various Model Providers**: llmaz supports a wide range of model providers, such as [HuggingFace](https://huggingface.co/), [ModelScope](https://www.modelscope.cn), and ObjectStores. llmaz will automatically handle the model loading, requiring no effort from users.
 - **Multi-host Support**: llmaz supports both single-host and multi-host scenarios with [LWS](https://github.com/kubernetes-sigs/lws) from day 0.
-- **Scaling Efficiency (WIP)**: llmaz works smoothly with autoscaling components like [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) or [Karpenter](https://github.com/kubernetes-sigs/karpenter) to satisfy elastic needs.
+- **Scaling Efficiency**: llmaz supports horizontal scaling with [HPA](./docs/examples/hpa/README.md) by default and will integrate with autoscaling components like [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) or [Karpenter](https://github.com/kubernetes-sigs/karpenter) for smart scaling across different clouds.
 
 ## Quick Start
 
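To ground the updated bullet, here is a minimal sketch (not part of this PR) of a Playground that opts into the default HPA-based scaling, assuming the `inference.llmaz.io/v1alpha1` API group used elsewhere in this repo; the model name `opt-125m` is a placeholder.

```yaml
# Hypothetical example: scaling bounds only, relying on whatever
# scaleTrigger the matching backendRuntime defines by default.
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: opt-125m
spec:
  replicas: 1
  modelClaim:
    modelName: opt-125m # placeholder model name
  elasticConfig:
    minReplicas: 1
    maxReplicas: 3
```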
20 changes: 10 additions & 10 deletions api/inference/v1alpha1/backendruntime_types.go
@@ -34,10 +34,10 @@ type BackendRuntimeArg struct {
 	Flags []string `json:"flags,omitempty"`
 }
 
-// HPAConfig represents the configuration of the HorizontalPodAutoscaler.
+// HPATrigger represents the configuration of the HorizontalPodAutoscaler.
 // Inspired by kubernetes.io/pkg/apis/autoscaling/types.go#HorizontalPodAutoscalerSpec.
 // Note: the HPA component should be installed in advance.
-type HPAConfig struct {
+type HPATrigger struct {
 	// metrics contains the specifications for which to use to calculate the
 	// desired replica count (the maximum replica count across all metrics will
 	// be used). The desired replica count is calculated multiplying the
@@ -54,11 +54,10 @@ type HPAConfig struct {
 	Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"`
 }
 
-// ScalePolicy defines the policy for scaling the workloads.
-// Support HPA only for now.
-type ScalePolicy struct {
-	// HPA represents the configuration of the HorizontalPodAutoscaler.
-	HPA *HPAConfig `json:"hpa,omitempty"`
+// ScaleTrigger defines the triggers to scale the workloads.
+type ScaleTrigger struct {
+	// HPA represents the trigger configuration of the HorizontalPodAutoscaler.
+	HPA *HPATrigger `json:"hpa,omitempty"`
 }
 
 // MultiHostCommands represents leader & worker commands for multi-node scenarios.
@@ -108,10 +107,11 @@ type BackendRuntimeSpec struct {
 	// when it might take a long time to load data or warm a cache, than during steady-state operation.
 	// +optional
 	StartupProbe *corev1.Probe `json:"startupProbe,omitempty"`
-	// ScalePolicy represents the rules for scaling the backend based on the metrics.
-	// If playground doesn't define the ScalePolicy, the defaulted policy here will be used.
+	// ScaleTrigger represents a set of triggers to scale the workloads based on metrics;
+	// only one trigger could work at a time, and only HPA is supported right now.
+	// If the playground doesn't define the ScaleTrigger, the trigger defined here will be used.
 	// +optional
-	ScalePolicy *ScalePolicy `json:"scalePolicy,omitempty"`
+	ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
 }
 
 // BackendRuntimeStatus defines the observed state of BackendRuntime
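As a reading aid for the renamed types above, a hedged sketch of how a BackendRuntime might carry a defaulted HPA trigger; the CPU utilization metric is a standard `autoscaling/v2` MetricSpec chosen for illustration, not taken from this PR.

```yaml
# Hypothetical BackendRuntime snippet: the defaulted trigger used when a
# Playground doesn't set one itself.
apiVersion: inference.llmaz.io/v1alpha1
kind: BackendRuntime
metadata:
  name: vllm # illustrative backend name
spec:
  scaleTrigger:
    hpa:
      metrics:
        - type: Resource
          resource:
            name: cpu
            target:
              type: Utilization
              averageUtilization: 80
```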
16 changes: 9 additions & 7 deletions api/inference/v1alpha1/config_types.go
@@ -61,16 +61,18 @@ type ResourceRequirements struct {
 
 type ElasticConfig struct {
 	// MinReplicas indicates the minimum number of inference workloads based on the traffic.
-	// Default to nil means we can scale down the instances to 1.
-	// If minReplicas set to 0, it requires to install serverless component at first.
-	MinReplicas int32 `json:"minReplicas"`
+	// Defaults to 1.
+	// MinReplicas can't be 0 for now; serverless support is planned for the future.
+	// +kubebuilder:default=1
+	// +optional
+	MinReplicas *int32 `json:"minReplicas,omitempty"`
 	// MaxReplicas indicates the maximum number of inference workloads based on the traffic.
 	// Default to nil means there's no limit for the instance number.
 	// +optional
 	MaxReplicas *int32 `json:"maxReplicas,omitempty"`
-	// ScalePolicy defines the rules for scaling the workloads.
-	// If not defined, policy configured in backendRuntime will be used,
-	// otherwise, policy defined here will overwrite the defaulted ones.
+	// ScaleTrigger defines a set of triggers to scale the workloads.
+	// If not defined, the trigger configured in backendRuntime will be used;
+	// otherwise, the trigger defined here will overwrite the defaulted one.
 	// +optional
-	ScalePolicy *ScalePolicy `json:"scalePolicy,omitempty"`
+	ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
 }
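And the override path suggested by the comments above: a Playground whose elasticConfig defines its own scaleTrigger, which then takes precedence over the backendRuntime default. Again a sketch with illustrative values, not code from this PR.

```yaml
# Hypothetical fragment (sits under a Playground's spec): the
# elasticConfig-level trigger overrides the backendRuntime default.
elasticConfig:
  minReplicas: 1
  maxReplicas: 5
  scaleTrigger:
    hpa:
      metrics:
        - type: Resource
          resource:
            name: cpu
            target:
              type: Utilization
              averageUtilization: 50
```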
35 changes: 20 additions & 15 deletions api/inference/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

20 changes: 10 additions & 10 deletions chart/templates/backends/llamacpp.yaml
@@ -24,16 +24,16 @@ spec:
       - --port
       - "8080"
     # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240.
-  - name: speculative-decoding
-    flags:
-    - -m
-    - "{{`{{ .ModelPath }}`}}"
-    - -md
-    - "{{`{{ .DraftModelPath }}`}}"
-    - --host
-    - "0.0.0.0"
-    - --port
-    - "8080"
+  # - name: speculative-decoding
+  #   flags:
+  #   - -m
+  #   - "{{`{{ .ModelPath }}`}}"
+  #   - -md
+  #   - "{{`{{ .DraftModelPath }}`}}"
+  #   - --host
+  #   - "0.0.0.0"
+  #   - --port
+  #   - "8080"
   resources:
     requests:
       cpu: 2
14 changes: 7 additions & 7 deletions client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go

Some generated files are not rendered by default.

8 changes: 4 additions & 4 deletions client-go/applyconfiguration/utils.go

Some generated files are not rendered by default.

9 changes: 5 additions & 4 deletions config/crd/bases/inference.llmaz.io_backendruntimes.yaml
@@ -547,13 +547,14 @@ spec:
                 More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
               type: object
             type: object
-          scalePolicy:
+          scaleTrigger:
             description: |-
-              ScalePolicy represents the rules for scaling the backend based on the metrics.
-              If playground doesn't define the ScalePolicy, the defaulted policy here will be used.
+              ScaleTrigger represents a set of triggers to scale the workloads based on metrics;
+              only one trigger could work at a time, and only HPA is supported right now.
+              If the playground doesn't define the ScaleTrigger, the trigger defined here will be used.
             properties:
               hpa:
-                description: HPA represents the configuration of the HorizontalPodAutoscaler.
+                description: HPA represents the trigger configuration of the HorizontalPodAutoscaler.
                 properties:
                   behavior:
                     description: |-
18 changes: 9 additions & 9 deletions config/crd/bases/inference.llmaz.io_playgrounds.yaml
@@ -241,20 +241,22 @@ spec:
                 format: int32
                 type: integer
               minReplicas:
+                default: 1
                 description: |-
                   MinReplicas indicates the minimum number of inference workloads based on the traffic.
-                  Default to nil means we can scale down the instances to 1.
-                  If minReplicas set to 0, it requires to install serverless component at first.
+                  Defaults to 1.
+                  MinReplicas can't be 0 for now; serverless support is planned for the future.
                 format: int32
                 type: integer
-              scalePolicy:
+              scaleTrigger:
                 description: |-
-                  ScalePolicy defines the rules for scaling the workloads.
-                  If not defined, policy configured in backendRuntime will be used,
-                  otherwise, policy defined here will overwrite the defaulted ones.
+                  ScaleTrigger defines a set of triggers to scale the workloads.
+                  If not defined, the trigger configured in backendRuntime will be used;
+                  otherwise, the trigger defined here will overwrite the defaulted one.
                 properties:
                   hpa:
-                    description: HPA represents the configuration of the HorizontalPodAutoscaler.
+                    description: HPA represents the trigger configuration of the
+                      HorizontalPodAutoscaler.
                     properties:
                       behavior:
                         description: |-
@@ -859,8 +861,6 @@ spec:
                     type: array
                 type: object
             type: object
-          required:
-          - minReplicas
             type: object
           modelClaim:
             description: |-
8 changes: 4 additions & 4 deletions config/manager/kustomization.yaml
@@ -1,8 +1,8 @@
 resources:
-- manager.yaml
+  - manager.yaml
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 images:
-- name: controller
-  newName: inftyai/llmaz
-  newTag: v0.0.9
+  - name: controller
+    newName: inftyai/test
+    newTag: llmaz-012305