Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config/charts/epp-standalone/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ dependencies:
- name: inference-extension
version: 0.0.0
repository: "file://../inference-extension"
# This is needed to make use of the common values.yaml in ./config/charts/inference-extension/values.yaml
alias: inferenceExtension
12 changes: 12 additions & 0 deletions config/charts/epp-standalone/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -295,4 +295,16 @@ inferenceExtension:
enabled: false

latencyPredictor:
# common latencyPredictor settings exist in config/charts/inference-extension/values.yaml
enabled: false

# Options: ["gke"]
provider:
name: none

# GKE-specific configuration.
# This block is only used if name is "gke".
gke:
# Set to true if the cluster is an Autopilot cluster.
autopilot: false

2 changes: 1 addition & 1 deletion config/charts/inference-extension/templates/_gke.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{{- define "inference-extension.gke" -}}
{{- if eq (lower .Values.provider.name) "gke" }}
{{- if and .Values.provider (eq (lower .Values.provider.name) "gke") }}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was this a bug?

Copy link
Copy Markdown
Contributor Author

@capri-xiyue capri-xiyue Jan 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously, in the inferencepool helm chart, the values.yaml provided "none" as a default, so we didn't have issues. But in the epp-standalone helm chart, the default "none" was not provided, so it hit this error. In this PR I've made "none" the default and, to be more defensive, added a check that provider exists before accessing its value. I was testing the "gke" setup, so I didn't notice this case where no provider value was set.

{{- if and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled }}
{{- $metricsReadSA := printf "%s-metrics-reader-sa" .Release.Name -}}
{{- $metricsReadSecretName := printf "%s-metrics-reader-secret" .Release.Name -}}
Expand Down
80 changes: 80 additions & 0 deletions config/charts/inference-extension/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
latencyPredictor:
enabled: false
# Training Server Configuration
trainingServer:
image:
hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars
name: latencypredictor-training-server
tag: latest
pullPolicy: Always
port: 8000
resources:
requests:
cpu: "2000m"
memory: "4Gi"
limits:
cpu: "4000m"
memory: "8Gi"
livenessProbe:
httpGet:
path: /healthz
port: 8000
initialDelaySeconds: 30
periodSeconds: 20
readinessProbe:
httpGet:
path: /readyz
port: 8000
initialDelaySeconds: 45
periodSeconds: 10
volumeSize: "20Gi"
config:
LATENCY_RETRAINING_INTERVAL_SEC: "1"
LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100"
LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib"
LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib"
LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib"
LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
LATENCY_MODEL_TYPE: "xgboost"
LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000"
LATENCY_QUANTILE_ALPHA: "0.9"

# Prediction Server Configuration
predictionServers:
count: 10
startPort: 8001
image:
hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars
name: latencypredictor-prediction-server
tag: latest
pullPolicy: Always
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "1000m"
memory: "2Gi"
livenessProbe:
httpGet:
path: /healthz
initialDelaySeconds: 15
periodSeconds: 15
readinessProbe:
httpGet:
path: /readyz
initialDelaySeconds: 10
periodSeconds: 5
failureThreshold: 10
volumeSize: "10Gi"
config:
LATENCY_MODEL_TYPE: "xgboost"
PREDICT_HOST: "0.0.0.0"
LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib"
LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib"
LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib"
LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib"

# EPP Environment Variables for Latency Predictor
eppEnv:
LATENCY_MAX_SAMPLE_SIZE: "10000"
2 changes: 2 additions & 0 deletions config/charts/inferencepool/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ dependencies:
- name: inference-extension
version: 0.0.0
repository: "file://../inference-extension"
# This is needed to make use of the common values.yaml in ./config/charts/inference-extension/values.yaml
alias: inferenceExtension
80 changes: 1 addition & 79 deletions config/charts/inferencepool/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,87 +69,9 @@ inferenceExtension:

# Latency Predictor Configuration
latencyPredictor:
# common latencyPredictor settings exist in config/charts/inference-extension/values.yaml
enabled: false

# Training Server Configuration
trainingServer:
image:
hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars
name: latencypredictor-training-server
tag: latest
pullPolicy: Always
port: 8000
resources:
requests:
cpu: "2000m"
memory: "4Gi"
limits:
cpu: "4000m"
memory: "8Gi"
livenessProbe:
httpGet:
path: /healthz
port: 8000
initialDelaySeconds: 30
periodSeconds: 20
readinessProbe:
httpGet:
path: /readyz
port: 8000
initialDelaySeconds: 45
periodSeconds: 10
volumeSize: "20Gi"
config:
LATENCY_RETRAINING_INTERVAL_SEC: "1"
LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100"
LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib"
LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib"
LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib"
LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
LATENCY_MODEL_TYPE: "xgboost"
LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000"
LATENCY_QUANTILE_ALPHA: "0.9"

# Prediction Server Configuration
predictionServers:
count: 10
startPort: 8001
image:
hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars
name: latencypredictor-prediction-server
tag: latest
pullPolicy: Always
resources:
requests:
cpu: "500m"
memory: "1Gi"
limits:
cpu: "1000m"
memory: "2Gi"
livenessProbe:
httpGet:
path: /healthz
initialDelaySeconds: 15
periodSeconds: 15
readinessProbe:
httpGet:
path: /readyz
initialDelaySeconds: 10
periodSeconds: 5
failureThreshold: 10
volumeSize: "10Gi"
config:
LATENCY_MODEL_TYPE: "xgboost"
PREDICT_HOST: "0.0.0.0"
LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib"
LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib"
LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib"
LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib"

# EPP Environment Variables for Latency Predictor
eppEnv:
LATENCY_MAX_SAMPLE_SIZE: "10000"

inferencePool:
targetPorts:
- number: 8000
Expand Down
26 changes: 26 additions & 0 deletions hack/verify-helm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ declare -A test_cases_inference_pool
test_cases_inference_pool["basic"]="--set inferencePool.modelServers.matchLabels.app=llm-instance-gateway"
test_cases_inference_pool["gke-provider"]="--set provider.name=gke --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway"
test_cases_inference_pool["multiple-replicas"]="--set inferencePool.replicas=3 --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway"
test_cases_inference_pool["latency-predictor"]="--set inferenceExtension.latencyPredictor.enabled=true --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway"

# Run the install command in case this script runs from a different bash
# source (such as in the verify-all script)
Expand All @@ -46,5 +47,30 @@ for key in "${!test_cases_inference_pool[@]}"; do
fi
done

declare -A test_cases_epp_standalone

# EPP Standalone Helm Chart test cases
test_cases_epp_standalone["basic"]="--set inferenceExtension.endpointsServer.endpointSelector='app=llm-instance-gateway'"
test_cases_epp_standalone["gke-provider"]="--set provider.name=gke --set inferenceExtension.endpointsServer.endpointSelector='app=llm-instance-gateway'"
test_cases_epp_standalone["latency-predictor"]="--set inferenceExtension.latencyPredictor.enabled=true --set inferenceExtension.endpointsServer.endpointSelector='app=llm-instance-gateway'"


echo "Building dependencies for epp-standalone chart..."
${SCRIPT_ROOT}/bin/helm dependency build ${SCRIPT_ROOT}/config/charts/epp-standalone
if [ $? -ne 0 ]; then
echo "Helm dependency build failed."
exit 1
fi

# Run the test cases
echo "Running helm template command for epp-standalone chart..."
# Loop through the keys of the associative array
for key in "${!test_cases_epp_standalone[@]}"; do
echo "Running test: $key"
${SCRIPT_ROOT}/bin/helm template ${SCRIPT_ROOT}/config/charts/epp-standalone ${test_cases_epp_standalone[$key]} --output-dir="${SCRIPT_ROOT}/bin"
if [ $? -ne 0 ]; then
echo "Helm template command failed for test: $key"
exit 1
fi
done