diff --git a/README.md b/README.md index 8944eb37..8e7d0c15 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Easy, advanced inference platform for large language models on Kubernetes - **SOTA Inference**: llmaz supports the latest cutting-edge researches like [Speculative Decoding](https://arxiv.org/abs/2211.17192) or [Splitwise](https://arxiv.org/abs/2311.18677)(WIP) to run on Kubernetes. - **Various Model Providers**: llmaz supports a wide range of model providers, such as [HuggingFace](https://huggingface.co/), [ModelScope](https://www.modelscope.cn), ObjectStores. llmaz will automatically handle the model loading, requiring no effort from users. - **Multi-hosts Support**: llmaz supports both single-host and multi-hosts scenarios with [LWS](https://github.com/kubernetes-sigs/lws) from day 0. -- **Scaling Efficiency (WIP)**: llmaz works smoothly with autoscaling components like [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) or [Karpenter](https://github.com/kubernetes-sigs/karpenter) to satisfy elastic needs. +- **Scaling Efficiency**: llmaz supports horizontal scaling with [HPA](./docs/examples/hpa/README.md) by default and will integrate with autoscaling components like [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) or [Karpenter](https://github.com/kubernetes-sigs/karpenter) for smart scaling across different clouds. ## Quick Start diff --git a/api/inference/v1alpha1/backendruntime_types.go b/api/inference/v1alpha1/backendruntime_types.go index 8907e717..808222bb 100644 --- a/api/inference/v1alpha1/backendruntime_types.go +++ b/api/inference/v1alpha1/backendruntime_types.go @@ -34,10 +34,10 @@ type BackendRuntimeArg struct { Flags []string `json:"flags,omitempty"` } -// HPAConfig represents the configuration of the HorizontalPodAutoscaler. 
+// HPATrigger represents the configuration of the HorizontalPodAutoscaler. // Inspired by kubernetes.io/pkg/apis/autoscaling/types.go#HorizontalPodAutoscalerSpec. // Note: HPA component should be installed in prior. -type HPAConfig struct { +type HPATrigger struct { // metrics contains the specifications for which to use to calculate the // desired replica count (the maximum replica count across all metrics will // be used). The desired replica count is calculated multiplying the @@ -54,11 +54,10 @@ type HPAConfig struct { Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"` } -// ScalePolicy defines the policy for scaling the workloads. -// Support HPA only for now. -type ScalePolicy struct { - // HPA represents the configuration of the HorizontalPodAutoscaler. - HPA *HPAConfig `json:"hpa,omitempty"` +// ScaleTrigger defines the scaler triggers to scale the workloads. +type ScaleTrigger struct { + // HPA represents the trigger configuration of the HorizontalPodAutoscaler. + HPA *HPATrigger `json:"hpa,omitempty"` } // MultiHostCommands represents leader & worker commands for multiple nodes scenarios. @@ -108,10 +107,11 @@ type BackendRuntimeSpec struct { // when it might take a long time to load data or warm a cache, than during steady-state operation. // +optional StartupProbe *corev1.Probe `json:"startupProbe,omitempty"` - // ScalePolicy represents the rules for scaling the backend based on the metrics. - // If playground doesn't define the ScalePolicy, the defaulted policy here will be used. + // ScaleTrigger represents a set of triggers to scale the workloads based on metrics, + // only one trigger could work at a time and only HPA is supported right now. + // If playground doesn't define the ScaleTrigger, the trigger defined here will be used. 
// +optional - ScalePolicy *ScalePolicy `json:"scalePolicy,omitempty"` + ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"` } // BackendRuntimeStatus defines the observed state of BackendRuntime diff --git a/api/inference/v1alpha1/config_types.go b/api/inference/v1alpha1/config_types.go index 59afa04d..ac983211 100644 --- a/api/inference/v1alpha1/config_types.go +++ b/api/inference/v1alpha1/config_types.go @@ -61,16 +61,18 @@ type ResourceRequirements struct { type ElasticConfig struct { // MinReplicas indicates the minimum number of inference workloads based on the traffic. - // Default to nil means we can scale down the instances to 1. - // If minReplicas set to 0, it requires to install serverless component at first. - MinReplicas int32 `json:"minReplicas"` + // Default to 1. + // MinReplicas couldn't be 0 now, will support serverless in the future. + // +kubebuilder:default=1 + // +optional + MinReplicas *int32 `json:"minReplicas,omitempty"` // MaxReplicas indicates the maximum number of inference workloads based on the traffic. // Default to nil means there's no limit for the instance number. // +optional MaxReplicas *int32 `json:"maxReplicas,omitempty"` - // ScalePolicy defines the rules for scaling the workloads. - // If not defined, policy configured in backendRuntime will be used, - // otherwise, policy defined here will overwrite the defaulted ones. + // ScaleTrigger defines a set of triggers to scale the workloads. + // If not defined, trigger configured in backendRuntime will be used, + // otherwise, trigger defined here will overwrite the defaulted ones. 
// +optional - ScalePolicy *ScalePolicy `json:"scalePolicy,omitempty"` + ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"` } diff --git a/api/inference/v1alpha1/zz_generated.deepcopy.go b/api/inference/v1alpha1/zz_generated.deepcopy.go index 69152c21..a14b0ce5 100644 --- a/api/inference/v1alpha1/zz_generated.deepcopy.go +++ b/api/inference/v1alpha1/zz_generated.deepcopy.go @@ -192,9 +192,9 @@ func (in *BackendRuntimeSpec) DeepCopyInto(out *BackendRuntimeSpec) { *out = new(v1.Probe) (*in).DeepCopyInto(*out) } - if in.ScalePolicy != nil { - in, out := &in.ScalePolicy, &out.ScalePolicy - *out = new(ScalePolicy) + if in.ScaleTrigger != nil { + in, out := &in.ScaleTrigger, &out.ScaleTrigger + *out = new(ScaleTrigger) (*in).DeepCopyInto(*out) } } @@ -234,14 +234,19 @@ func (in *BackendRuntimeStatus) DeepCopy() *BackendRuntimeStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ElasticConfig) DeepCopyInto(out *ElasticConfig) { *out = *in + if in.MinReplicas != nil { + in, out := &in.MinReplicas, &out.MinReplicas + *out = new(int32) + **out = **in + } if in.MaxReplicas != nil { in, out := &in.MaxReplicas, &out.MaxReplicas *out = new(int32) **out = **in } - if in.ScalePolicy != nil { - in, out := &in.ScalePolicy, &out.ScalePolicy - *out = new(ScalePolicy) + if in.ScaleTrigger != nil { + in, out := &in.ScaleTrigger, &out.ScaleTrigger + *out = new(ScaleTrigger) (*in).DeepCopyInto(*out) } } @@ -257,7 +262,7 @@ func (in *ElasticConfig) DeepCopy() *ElasticConfig { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *HPAConfig) DeepCopyInto(out *HPAConfig) { +func (in *HPATrigger) DeepCopyInto(out *HPATrigger) { *out = *in if in.Metrics != nil { in, out := &in.Metrics, &out.Metrics @@ -273,12 +278,12 @@ func (in *HPAConfig) DeepCopyInto(out *HPAConfig) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HPAConfig. -func (in *HPAConfig) DeepCopy() *HPAConfig { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HPATrigger. +func (in *HPATrigger) DeepCopy() *HPATrigger { if in == nil { return nil } - out := new(HPAConfig) + out := new(HPATrigger) in.DeepCopyInto(out) return out } @@ -459,21 +464,21 @@ func (in *ResourceRequirements) DeepCopy() *ResourceRequirements { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ScalePolicy) DeepCopyInto(out *ScalePolicy) { +func (in *ScaleTrigger) DeepCopyInto(out *ScaleTrigger) { *out = *in if in.HPA != nil { in, out := &in.HPA, &out.HPA - *out = new(HPAConfig) + *out = new(HPATrigger) (*in).DeepCopyInto(*out) } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScalePolicy. -func (in *ScalePolicy) DeepCopy() *ScalePolicy { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScaleTrigger. +func (in *ScaleTrigger) DeepCopy() *ScaleTrigger { if in == nil { return nil } - out := new(ScalePolicy) + out := new(ScaleTrigger) in.DeepCopyInto(out) return out } diff --git a/chart/templates/backends/llamacpp.yaml b/chart/templates/backends/llamacpp.yaml index 2eedfab6..2b85c24c 100644 --- a/chart/templates/backends/llamacpp.yaml +++ b/chart/templates/backends/llamacpp.yaml @@ -24,16 +24,16 @@ spec: - --port - "8080" # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240. 
- - name: speculative-decoding - flags: - - -m - - "{{`{{ .ModelPath }}`}}" - - -md - - "{{`{{ .DraftModelPath }}`}}" - - --host - - "0.0.0.0" - - --port - - "8080" + # - name: speculative-decoding + # flags: + # - -m + # - "{{`{{ .ModelPath }}`}}" + # - -md + # - "{{`{{ .DraftModelPath }}`}}" + # - --host + # - "0.0.0.0" + # - --port + # - "8080" resources: requests: cpu: 2 diff --git a/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go b/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go index cef19536..fcc4d84d 100644 --- a/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go +++ b/client-go/applyconfiguration/inference/v1alpha1/elasticconfig.go @@ -20,9 +20,9 @@ package v1alpha1 // ElasticConfigApplyConfiguration represents a declarative configuration of the ElasticConfig type for use // with apply. type ElasticConfigApplyConfiguration struct { - MinReplicas *int32 `json:"minReplicas,omitempty"` - MaxReplicas *int32 `json:"maxReplicas,omitempty"` - ScalePolicy *ScalePolicyApplyConfiguration `json:"scalePolicy,omitempty"` + MinReplicas *int32 `json:"minReplicas,omitempty"` + MaxReplicas *int32 `json:"maxReplicas,omitempty"` + ScaleTrigger *ScaleTriggerApplyConfiguration `json:"scaleTrigger,omitempty"` } // ElasticConfigApplyConfiguration constructs a declarative configuration of the ElasticConfig type for use with @@ -47,10 +47,10 @@ func (b *ElasticConfigApplyConfiguration) WithMaxReplicas(value int32) *ElasticC return b } -// WithScalePolicy sets the ScalePolicy field in the declarative configuration to the given value +// WithScaleTrigger sets the ScaleTrigger field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the ScalePolicy field is set to the value of the last call. 
-func (b *ElasticConfigApplyConfiguration) WithScalePolicy(value *ScalePolicyApplyConfiguration) *ElasticConfigApplyConfiguration { - b.ScalePolicy = value +// If called multiple times, the ScaleTrigger field is set to the value of the last call. +func (b *ElasticConfigApplyConfiguration) WithScaleTrigger(value *ScaleTriggerApplyConfiguration) *ElasticConfigApplyConfiguration { + b.ScaleTrigger = value return b } diff --git a/client-go/applyconfiguration/inference/v1alpha1/hpaconfig.go b/client-go/applyconfiguration/inference/v1alpha1/hpatrigger.go similarity index 72% rename from client-go/applyconfiguration/inference/v1alpha1/hpaconfig.go rename to client-go/applyconfiguration/inference/v1alpha1/hpatrigger.go index a7345c1d..abe35cb7 100644 --- a/client-go/applyconfiguration/inference/v1alpha1/hpaconfig.go +++ b/client-go/applyconfiguration/inference/v1alpha1/hpatrigger.go @@ -21,23 +21,23 @@ import ( v2 "k8s.io/api/autoscaling/v2" ) -// HPAConfigApplyConfiguration represents a declarative configuration of the HPAConfig type for use +// HPATriggerApplyConfiguration represents a declarative configuration of the HPATrigger type for use // with apply. -type HPAConfigApplyConfiguration struct { +type HPATriggerApplyConfiguration struct { Metrics []v2.MetricSpec `json:"metrics,omitempty"` Behavior *v2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"` } -// HPAConfigApplyConfiguration constructs a declarative configuration of the HPAConfig type for use with +// HPATriggerApplyConfiguration constructs a declarative configuration of the HPATrigger type for use with // apply. -func HPAConfig() *HPAConfigApplyConfiguration { - return &HPAConfigApplyConfiguration{} +func HPATrigger() *HPATriggerApplyConfiguration { + return &HPATriggerApplyConfiguration{} } // WithMetrics adds the given value to the Metrics field in the declarative configuration // and returns the receiver, so that objects can be build by chaining "With" function invocations. 
// If called multiple times, values provided by each call will be appended to the Metrics field. -func (b *HPAConfigApplyConfiguration) WithMetrics(values ...v2.MetricSpec) *HPAConfigApplyConfiguration { +func (b *HPATriggerApplyConfiguration) WithMetrics(values ...v2.MetricSpec) *HPATriggerApplyConfiguration { for i := range values { b.Metrics = append(b.Metrics, values[i]) } @@ -47,7 +47,7 @@ func (b *HPAConfigApplyConfiguration) WithMetrics(values ...v2.MetricSpec) *HPAC // WithBehavior sets the Behavior field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Behavior field is set to the value of the last call. -func (b *HPAConfigApplyConfiguration) WithBehavior(value v2.HorizontalPodAutoscalerBehavior) *HPAConfigApplyConfiguration { +func (b *HPATriggerApplyConfiguration) WithBehavior(value v2.HorizontalPodAutoscalerBehavior) *HPATriggerApplyConfiguration { b.Behavior = &value return b } diff --git a/client-go/applyconfiguration/inference/v1alpha1/scalepolicy.go b/client-go/applyconfiguration/inference/v1alpha1/scaletrigger.go similarity index 63% rename from client-go/applyconfiguration/inference/v1alpha1/scalepolicy.go rename to client-go/applyconfiguration/inference/v1alpha1/scaletrigger.go index d91d4e6e..5bee2bd4 100644 --- a/client-go/applyconfiguration/inference/v1alpha1/scalepolicy.go +++ b/client-go/applyconfiguration/inference/v1alpha1/scaletrigger.go @@ -17,22 +17,22 @@ limitations under the License. package v1alpha1 -// ScalePolicyApplyConfiguration represents a declarative configuration of the ScalePolicy type for use +// ScaleTriggerApplyConfiguration represents a declarative configuration of the ScaleTrigger type for use // with apply. 
-type ScalePolicyApplyConfiguration struct { - HPA *HPAConfigApplyConfiguration `json:"hpa,omitempty"` +type ScaleTriggerApplyConfiguration struct { + HPA *HPATriggerApplyConfiguration `json:"hpa,omitempty"` } -// ScalePolicyApplyConfiguration constructs a declarative configuration of the ScalePolicy type for use with +// ScaleTriggerApplyConfiguration constructs a declarative configuration of the ScaleTrigger type for use with // apply. -func ScalePolicy() *ScalePolicyApplyConfiguration { - return &ScalePolicyApplyConfiguration{} +func ScaleTrigger() *ScaleTriggerApplyConfiguration { + return &ScaleTriggerApplyConfiguration{} } // WithHPA sets the HPA field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the HPA field is set to the value of the last call. -func (b *ScalePolicyApplyConfiguration) WithHPA(value *HPAConfigApplyConfiguration) *ScalePolicyApplyConfiguration { +func (b *ScaleTriggerApplyConfiguration) WithHPA(value *HPATriggerApplyConfiguration) *ScaleTriggerApplyConfiguration { b.HPA = value return b } diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index 003f8b84..cc2ca402 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -39,8 +39,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &inferencev1alpha1.BackendRuntimeConfigApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("ElasticConfig"): return &inferencev1alpha1.ElasticConfigApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("HPAConfig"): - return &inferencev1alpha1.HPAConfigApplyConfiguration{} + case v1alpha1.SchemeGroupVersion.WithKind("HPATrigger"): + return &inferencev1alpha1.HPATriggerApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("Playground"): return &inferencev1alpha1.PlaygroundApplyConfiguration{} case 
v1alpha1.SchemeGroupVersion.WithKind("PlaygroundSpec"): @@ -49,8 +49,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &inferencev1alpha1.PlaygroundSpecApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("PlaygroundStatus"): return &inferencev1alpha1.PlaygroundStatusApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("ResourceRequirements"): return &inferencev1alpha1.ResourceRequirementsApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("ScalePolicy"): - return &inferencev1alpha1.ScalePolicyApplyConfiguration{} + case v1alpha1.SchemeGroupVersion.WithKind("ScaleTrigger"): + return &inferencev1alpha1.ScaleTriggerApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("Service"): return &inferencev1alpha1.ServiceApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("ServiceSpec"): diff --git a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml index 768e070a..768a30fa 100644 --- a/config/crd/bases/inference.llmaz.io_backendruntimes.yaml +++ b/config/crd/bases/inference.llmaz.io_backendruntimes.yaml @@ -547,13 +547,14 @@ spec: More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object - scalePolicy: + scaleTrigger: description: |- - ScalePolicy represents the rules for scaling the backend based on the metrics. - If playground doesn't define the ScalePolicy, the defaulted policy here will be used. + ScaleTrigger represents a set of triggers to scale the workloads based on metrics, + only one trigger could work at a time and only HPA is supported right now. + If playground doesn't define the ScaleTrigger, the trigger defined here will be used. properties: hpa: - description: HPA represents the configuration of the HorizontalPodAutoscaler. + description: HPA represents the trigger configuration of the HorizontalPodAutoscaler. 
properties: behavior: description: |- diff --git a/config/crd/bases/inference.llmaz.io_playgrounds.yaml b/config/crd/bases/inference.llmaz.io_playgrounds.yaml index 42a548f9..aef022b1 100644 --- a/config/crd/bases/inference.llmaz.io_playgrounds.yaml +++ b/config/crd/bases/inference.llmaz.io_playgrounds.yaml @@ -241,20 +241,22 @@ spec: format: int32 type: integer minReplicas: + default: 1 description: |- MinReplicas indicates the minimum number of inference workloads based on the traffic. - Default to nil means we can scale down the instances to 1. - If minReplicas set to 0, it requires to install serverless component at first. + Default to 1. + MinReplicas couldn't be 0 now, will support serverless in the future. format: int32 type: integer - scalePolicy: + scaleTrigger: description: |- - ScalePolicy defines the rules for scaling the workloads. - If not defined, policy configured in backendRuntime will be used, - otherwise, policy defined here will overwrite the defaulted ones. + ScaleTrigger defines a set of triggers to scale the workloads. + If not defined, trigger configured in backendRuntime will be used, + otherwise, trigger defined here will overwrite the defaulted ones. properties: hpa: - description: HPA represents the configuration of the HorizontalPodAutoscaler. + description: HPA represents the trigger configuration of the + HorizontalPodAutoscaler. 
properties: behavior: description: |- @@ -859,8 +861,6 @@ spec: type: array type: object type: object - required: - - minReplicas type: object modelClaim: description: |- diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 58f2e438..2a540057 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -1,8 +1,8 @@ resources: - - manager.yaml +- manager.yaml apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: - - name: controller - newName: inftyai/llmaz - newTag: v0.0.9 +- name: controller + newName: inftyai/test + newTag: llmaz-012305 diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 407adcb1..4ce1eded 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -23,6 +23,18 @@ rules: - list - update - watch +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - inference.llmaz.io resources: diff --git a/docs/examples/hpa/README.md b/docs/examples/hpa/README.md new file mode 100644 index 00000000..a14e7444 --- /dev/null +++ b/docs/examples/hpa/README.md @@ -0,0 +1,40 @@ +# Horizontal Scaling With Playgrounds + +We only support HPA right now, but will try to integrate with KEDA and Knative in the future. 
+ +## Install the Metric Server + +HPA depends on the metric-server for scaling decisions, so we need to install it in prior, see install command below: + +```cmd +kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml +``` + +## How to Use + +If your backendRuntime has already configured the `ScaleTrigger`, set the `playground.elasticConfig` like this: + +```yaml +spec: + elasticConfig: + minReplicas: 1 + maxReplicas: 3 +``` + +If not, you can set the scaleTrigger directly in Playground like this: + +```yaml +spec: + elasticConfig: + minReplicas: 1 + maxReplicas: 3 + scaleTrigger: + hpa: + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 50 +``` diff --git a/docs/examples/hpa/playground.yaml b/docs/examples/hpa/playground.yaml new file mode 100644 index 00000000..07b68770 --- /dev/null +++ b/docs/examples/hpa/playground.yaml @@ -0,0 +1,26 @@ +apiVersion: inference.llmaz.io/v1alpha1 +kind: Playground +metadata: + name: qwen2-0--5b +spec: + replicas: 1 + modelClaim: + modelName: qwen2-0--5b-gguf + backendRuntimeConfig: + name: llamacpp + args: + name: "default" + flags: + - -fa # use flash attention + elasticConfig: + minReplicas: 1 + maxReplicas: 3 + scaleTrigger: + hpa: + metrics: + - type: Resource + resource: + name: cpu + target: + averageUtilization: 50 + type: Utilization diff --git a/pkg/controller/inference/playground_controller.go b/pkg/controller/inference/playground_controller.go index 4eba09a3..3e291881 100644 --- a/pkg/controller/inference/playground_controller.go +++ b/pkg/controller/inference/playground_controller.go @@ -21,6 +21,7 @@ import ( "fmt" "reflect" + autoscalingv2 "k8s.io/api/autoscaling/v2" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" @@ -30,6 +31,7 @@ import ( metaapplyv1 "k8s.io/client-go/applyconfigurations/meta/v1" 
"k8s.io/client-go/tools/record" "k8s.io/klog/v2" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -67,6 +69,7 @@ func NewPlaygroundReconciler(client client.Client, scheme *runtime.Scheme, recor //+kubebuilder:rbac:groups=inference.llmaz.io,resources=playgrounds,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=inference.llmaz.io,resources=playgrounds/status,verbs=get;update;patch //+kubebuilder:rbac:groups=inference.llmaz.io,resources=playgrounds/finalizers,verbs=update +//+kubebuilder:rbac:groups=autoscaling,resources=horizontalpodautoscalers,verbs=get;list;watch;create;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. @@ -117,17 +120,27 @@ func (r *PlaygroundReconciler) Reconcile(ctx context.Context, req ctrl.Request) logger.Error(err, "failed to build inference Service") return ctrl.Result{}, err } - if err := setControllerReferenceForService(playground, serviceApplyConfiguration, r.Scheme); err != nil { logger.Error(err, "failed to set OwnerReference for Service", "Service", fmt.Sprintf("%s/%s", playground.Namespace, playground.Name)) return ctrl.Result{}, err } - if err := util.Patch(ctx, r.Client, serviceApplyConfiguration); err != nil { logger.Error(err, "failed to patch Service", "Service", fmt.Sprintf("%s/%s", playground.Namespace, playground.Name)) return ctrl.Result{}, err } + scalingConfiguration := buildScalingConfiguration(playground, backendRuntime) + if scalingConfiguration != nil { + if err := setControllerReferenceForScalingConfiguration(playground, scalingConfiguration, r.Scheme); err != nil { + logger.Error(err, "failed to set OwnerReference for scaling workload", "workload", fmt.Sprintf("%s/%s", playground.Namespace, playground.Name), "kind", scalingConfiguration.Kind) + return ctrl.Result{}, err + } + 
if err := util.Patch(ctx, r.Client, scalingConfiguration); err != nil { + logger.Error(err, "failed to patch scaling workload", "workload", fmt.Sprintf("%s/%s", playground.Namespace, playground.Name), "kind", scalingConfiguration.Kind) + return ctrl.Result{}, err + } + } + // Handle status. setPlaygroundCondition(playground, service) if err := r.Client.Status().Update(ctx, playground); err != nil { @@ -513,3 +526,79 @@ func setControllerReferenceForService(owner metav1.Object, saf *inferenceclientg WithController(true)) return nil } + +// buildScalingConfiguration supports HPA only now. +func buildScalingConfiguration(playground *inferenceapi.Playground, backend *inferenceapi.BackendRuntime) *autoscalingv2.HorizontalPodAutoscaler { + if playground.Spec.ElasticConfig == nil { + return nil + } + + // Handle HPA. + if (playground.Spec.ElasticConfig.ScaleTrigger != nil && playground.Spec.ElasticConfig.ScaleTrigger.HPA != nil) || + (backend.Spec.ScaleTrigger != nil && backend.Spec.ScaleTrigger.HPA != nil) { + + hpa := &autoscalingv2.HorizontalPodAutoscaler{ + TypeMeta: metav1.TypeMeta{ + APIVersion: autoscalingv2.SchemeGroupVersion.String(), + Kind: "HorizontalPodAutoscaler", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: playground.Name, + Namespace: playground.Namespace, + }, + Spec: autoscalingv2.HorizontalPodAutoscalerSpec{ + ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{ + APIVersion: inferenceapi.SchemeGroupVersion.String(), + Kind: "Playground", + Name: playground.Name, + }, + }, + } + + hpa.Spec.MinReplicas = playground.Spec.ElasticConfig.MinReplicas + if playground.Spec.ElasticConfig.MaxReplicas == nil { + // The value is hardcoded, because maxReplicas is required by HPA. 
+ hpa.Spec.MaxReplicas = 99999 + } else { + hpa.Spec.MaxReplicas = *playground.Spec.ElasticConfig.MaxReplicas + } + + if playground.Spec.ElasticConfig.ScaleTrigger != nil && playground.Spec.ElasticConfig.ScaleTrigger.HPA != nil { + hpa.Spec.Metrics = playground.Spec.ElasticConfig.ScaleTrigger.HPA.Metrics + hpa.Spec.Behavior = playground.Spec.ElasticConfig.ScaleTrigger.HPA.Behavior + } else { + hpa.Spec.Metrics = backend.Spec.ScaleTrigger.HPA.Metrics + hpa.Spec.Behavior = backend.Spec.ScaleTrigger.HPA.Behavior + } + + return hpa + } + + return nil +} + +func setControllerReferenceForScalingConfiguration(owner metav1.Object, hpa *autoscalingv2.HorizontalPodAutoscaler, scheme *runtime.Scheme) error { + if hpa == nil { + return nil + } + + ro, ok := owner.(runtime.Object) + if !ok { + return fmt.Errorf("%T is not a runtime.Object, cannot call SetOwnerReference", owner) + } + gvk, err := apiutil.GVKForObject(ro, scheme) + if err != nil { + return err + } + hpa.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: gvk.GroupVersion().String(), + Kind: gvk.Kind, + Name: owner.GetName(), + UID: owner.GetUID(), + BlockOwnerDeletion: ptr.To[bool](true), + Controller: ptr.To[bool](true), + }, + } + return nil +} diff --git a/pkg/webhook/playground_webhook.go b/pkg/webhook/playground_webhook.go index acf44077..53c25839 100644 --- a/pkg/webhook/playground_webhook.go +++ b/pkg/webhook/playground_webhook.go @@ -140,5 +140,23 @@ func (w *PlaygroundWebhook) generateValidate(obj runtime.Object) field.ErrorList } } + if playground.Spec.ElasticConfig != nil { + if playground.Spec.ElasticConfig.MinReplicas != nil && *playground.Spec.ElasticConfig.MinReplicas == 0 { + allErrs = append(allErrs, field.Forbidden(specPath.Child("elasticConfig.minReplicas"), "minReplicas couldn't be 0")) + } + + if playground.Spec.ElasticConfig.MinReplicas != nil && playground.Spec.ElasticConfig.MaxReplicas != nil { + if *playground.Spec.ElasticConfig.MinReplicas >= *playground.Spec.ElasticConfig.MaxReplicas { + allErrs = append(allErrs, 
field.Invalid(specPath.Child("elasticConfig.minReplicas"), *playground.Spec.ElasticConfig.MinReplicas, "minReplicas must be less than maxReplicas")) + } + } + + if playground.Spec.ElasticConfig.ScaleTrigger != nil { + if playground.Spec.ElasticConfig.ScaleTrigger.HPA == nil { + allErrs = append(allErrs, field.Forbidden(specPath.Child("elasticConfig.scaleTrigger.hpa"), "hpa couldn't be nil")) + } + } + } + + return allErrs } diff --git a/test/config/backends/llamacpp.yaml b/test/config/backends/llamacpp.yaml index df1fe360..ea4554e4 100644 --- a/test/config/backends/llamacpp.yaml +++ b/test/config/backends/llamacpp.yaml @@ -21,20 +21,20 @@ spec: - --port - "8080" # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240. - - name: speculative-decoding - flags: - - -m - - "{{ .ModelPath }}" - - -md - - "{{ .DraftModelPath }}" - - --host - - "0.0.0.0" - - --port - - "8080" - - --draft-max - - "16" - - --draft-min - - "5" + # - name: speculative-decoding + # flags: + # - -m + # - "{{ .ModelPath }}" + # - -md + # - "{{ .DraftModelPath }}" + # - --host + # - "0.0.0.0" + # - --port + # - "8080" + # - --draft-max + # - "16" + # - --draft-min + # - "5" resources: requests: cpu: 2 diff --git a/test/integration/webhook/playground_test.go b/test/integration/webhook/playground_test.go index f7b10b63..03041980 100644 --- a/test/integration/webhook/playground_test.go +++ b/test/integration/webhook/playground_test.go @@ -111,6 +111,30 @@ var _ = ginkgo.Describe("Playground default and validation", func() { }, failed: true, }), + ginkgo.Entry("hpa couldn't be nil once elasticConfig is not nil", &testValidatingCase{ + playground: func() *inferenceapi.Playground { + return wrapper.MakePlayground("playground", ns.Name).ModelClaim("llama3-8b").Replicas(1).HPA(nil).Obj() + }, + failed: true, + }), + ginkgo.Entry("minReplicas is 0 once elasticConfig is not nil", &testValidatingCase{ + playground: func() *inferenceapi.Playground { + return 
wrapper.MakePlayground("playground", ns.Name).ModelClaim("llama3-8b").Replicas(1).ElasticConfig(0, 10).Obj() + }, + failed: true, + }), + ginkgo.Entry("minReplicas is greater than maxReplicas", &testValidatingCase{ + playground: func() *inferenceapi.Playground { + return wrapper.MakePlayground("playground", ns.Name).ModelClaim("llama3-8b").Replicas(1).ElasticConfig(10, 1).Obj() + }, + failed: true, + }), + ginkgo.Entry("minReplicas is equal with maxReplicas", &testValidatingCase{ + playground: func() *inferenceapi.Playground { + return wrapper.MakePlayground("playground", ns.Name).ModelClaim("llama3-8b").Replicas(1).ElasticConfig(1, 1).Obj() + }, + failed: true, + }), ) type testDefaultingCase struct { diff --git a/test/util/wrapper/playground.go b/test/util/wrapper/playground.go index 71643d63..5b2daef3 100644 --- a/test/util/wrapper/playground.go +++ b/test/util/wrapper/playground.go @@ -163,7 +163,18 @@ func (w *PlaygroundWrapper) BackendRuntimeLimit(r, v string) *PlaygroundWrapper func (w *PlaygroundWrapper) ElasticConfig(minReplicas, maxReplicas int32) *PlaygroundWrapper { w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{ MaxReplicas: ptr.To[int32](maxReplicas), - MinReplicas: minReplicas, + MinReplicas: ptr.To[int32](minReplicas), } return w } + +func (w *PlaygroundWrapper) HPA(config *inferenceapi.HPATrigger) *PlaygroundWrapper { + if w.Spec.ElasticConfig == nil { + w.Spec.ElasticConfig = &inferenceapi.ElasticConfig{} + } + if w.Spec.ElasticConfig.ScaleTrigger == nil { + w.Spec.ElasticConfig.ScaleTrigger = &inferenceapi.ScaleTrigger{} + } + w.Spec.ElasticConfig.ScaleTrigger.HPA = config + return w +}