
Commit 83b5dac

sats-23ahg-g authored and committed
Model replacement to Qwen3-32B (kubernetes-sigs#2189)
* Model replacement to Qwen3-32B
* Update config/manifests/sglang/gpu-deployment.yaml: add back sglang engine type label
* Update config/manifests/vllm/cpu-deployment.yaml: add back vllm engine type label
* Update config/manifests/vllm/gpu-deployment.yaml: add back vllm engine type label

Signed-off-by: Sathvik <Sathvik.S@ibm.com>
Co-authored-by: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com>
1 parent 455b9fd commit 83b5dac

31 files changed: +216 -216 lines

config/charts/inferencepool/README.md

Lines changed: 13 additions & 13 deletions
@@ -4,18 +4,18 @@ A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) depl
 
 ## Install
 
-To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:
+To install an InferencePool named `vllm-qwen3-32b` that selects from endpoints with label `app: vllm-qwen3-32b` and listening on port `8000`, you can run the following command:
 
 ```txt
-$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+$ helm install vllm-qwen3-32b ./config/charts/inferencepool \
+  --set inferencePool.modelServers.matchLabels.app=vllm-qwen3-32b \
 ```
 
 To install via the latest published chart in staging (--version v0 indicates latest dev version), you can run the following command:
 
 ```txt
-$ helm install vllm-llama3-8b-instruct \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+$ helm install vllm-qwen3-32b \
+  --set inferencePool.modelServers.matchLabels.app=vllm-qwen3-32b \
   --set provider.name=[none|gke|istio] \
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
 ```
@@ -27,8 +27,8 @@ Note that the provider name is needed to deploy provider-specific resources. If
 To set cmd-line flags, you can use the `--set` option to set each flag, e.g.,:
 
 ```txt
-$ helm install vllm-llama3-8b-instruct \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+$ helm install vllm-qwen3-32b \
+  --set inferencePool.modelServers.matchLabels.app=vllm-qwen3-32b \
   --set inferenceExtension.flags.<FLAG_NAME>=<FLAG_VALUE>
   --set provider.name=[none|gke|istio] \
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
@@ -64,7 +64,7 @@ inferenceExtension:
 Then apply it with:
 
 ```txt
-$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+$ helm install vllm-qwen3-32b ./config/charts/inferencepool -f values.yaml
 ```
 
 ### Install with Custom EPP Plugins Configuration
@@ -106,7 +106,7 @@ inferenceExtension:
 Then apply it with:
 
 ```txt
-$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+$ helm install vllm-qwen3-32b ./config/charts/inferencepool -f values.yaml
 ```
 
 ### Install for Triton TensorRT-LLM
@@ -159,8 +159,8 @@ To enable HA, set `inferenceExtension.replicas` to a number greater than 1.
 * Via `--set` flag:
 
 ```txt
-helm install vllm-llama3-8b-instruct \
-  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+helm install vllm-qwen3-32b \
+  --set inferencePool.modelServers.matchLabels.app=vllm-qwen3-32b \
   --set inferenceExtension.replicas=3 \
   --set provider=[none|gke] \
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
@@ -176,7 +176,7 @@ To enable HA, set `inferenceExtension.replicas` to a number greater than 1.
 Then apply it with:
 
 ```txt
-helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+helm install vllm-qwen3-32b ./config/charts/inferencepool -f values.yaml
 ```
 
 ### Install with Monitoring
@@ -204,7 +204,7 @@ If you are using a GKE Autopilot cluster, you also need to set `provider.gke.aut
 Then apply it with:
 
 ```txt
-helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+helm install vllm-qwen3-32b ./config/charts/inferencepool -f values.yaml
 ```
 
 ## Uninstall
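The chart selects model-server endpoints via `matchLabels`, which is why the rename must be applied consistently to both the selector and the pod labels. A minimal sketch of Kubernetes `matchLabels` semantics (the pod labels below are illustrative, not taken from a real cluster):

```python
# Sketch of Kubernetes matchLabels semantics: a pod matches only if
# every selector key/value pair appears verbatim in the pod's labels.
def matches(selector: dict, labels: dict) -> bool:
    return all(labels.get(k) == v for k, v in selector.items())

selector = {"app": "vllm-qwen3-32b"}
old_pod = {"app": "vllm-llama3-8b-instruct"}
new_pod = {"app": "vllm-qwen3-32b",
           "inference.networking.k8s.io/engine-type": "vllm"}

print(matches(selector, old_pod))  # False: pods with the old label drop out of the pool
print(matches(selector, new_pod))  # True
```

This is why the engine-type labels had to be added back in this commit: the pod template's labels must remain a superset of whatever selectors reference them.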

config/charts/inferencepool/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ inferencePool:
   apiVersion: inference.networking.k8s.io/v1
   # modelServers: # REQUIRED
   #   matchLabels:
-  #     app: vllm-llama3-8b-instruct
+  #     app: vllm-qwen3-32b
 
 # Should only used if apiVersion is inference.networking.x-k8s.io/v1alpha2,
 # This will soon be deprecated when upstream GW providers support v1, just doing something simple for now.

config/charts/standalone/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ inferenceExtension:
   # set it to false when you want to deploy EPP with inferencepool
   createInferencePool: true
   # Required when createInferencePool is false
-  # endpointSelector: app=vllm-llama3-8b-instruct
+  # endpointSelector: app=vllm-qwen3-32b
   # unused when createInferencePool is true
   targetPorts: 8000
   # unused when createInferencePool is true
Lines changed: 4 additions & 4 deletions
@@ -1,12 +1,12 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferenceObjective
 metadata:
-  name: food-review
+  name: small-segment-lora
 spec:
   priority: 1
   poolRef:
     group: inference.networking.k8s.io
-    name: vllm-llama3-8b-instruct
+    name: vllm-qwen3-32b
 ---
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferenceObjective
@@ -16,7 +16,7 @@ spec:
   priority: 2
   poolRef:
     group: inference.networking.k8s.io
-    name: vllm-llama3-8b-instruct
+    name: vllm-qwen3-32b
 ---
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferenceObjective
@@ -26,4 +26,4 @@ spec:
   priority: 2
   poolRef:
     group: inference.networking.k8s.io
-    name: vllm-llama3-8b-instruct
+    name: vllm-qwen3-32b

config/manifests/sglang/gpu-deployment.yaml

Lines changed: 5 additions & 5 deletions
@@ -1,26 +1,26 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: sgl-llama3-8b-instruct
+  name: sgl-qwen3-32b-instruct
   labels:
-    app: sgl-llama3-8b-instruct
+    app: sgl-qwen3-32b-instruct
 spec:
   replicas: 3
   selector:
     matchLabels:
-      app: sgl-llama3-8b-instruct
+      app: sgl-qwen3-32b-instruct
   template:
     metadata:
       labels:
-        app: sgl-llama3-8b-instruct
+        app: sgl-qwen3-32b-instruct
         inference.networking.k8s.io/engine-type: sglang
     spec:
       containers:
       - name: sglang
         image: lmsysorg/sglang:latest
         command: ["python3", "-m", "sglang.launch_server"]
         args:
-        - "--model-path=meta-llama/Llama-3.1-8B-Instruct"
+        - "--model-path=Qwen/Qwen3-32B"
        - "--host=0.0.0.0"
        - "--port=8000"
        - "--dtype=bfloat16"

config/manifests/vllm/cpu-deployment.yaml

Lines changed: 11 additions & 11 deletions
@@ -1,16 +1,16 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama3-8b-instruct
+  name: vllm-qwen3-32b
 spec:
   replicas: 3
   selector:
     matchLabels:
-      app: vllm-llama3-8b-instruct
+      app: vllm-qwen3-32b
   template:
     metadata:
       labels:
-        app: vllm-llama3-8b-instruct
+        app: vllm-qwen3-32b
         inference.networking.k8s.io/engine-type: vllm
     spec:
       containers:
@@ -20,15 +20,15 @@ spec:
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args:
         - "--model"
-        - "Qwen/Qwen2.5-1.5B-Instruct"
+        - "Qwen/Qwen3-32B"
         - "--port"
         - "8000"
         - "--enable-lora"
         - "--max-loras"
         - "4"
         - "--lora-modules"
-        - '{"name": "food-review-0", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
-        - '{"name": "food-review-1", "path": "SriSanth2345/Qwen-1.5B-Tweet-Generations", "base_model_name": "Qwen/Qwen2.5-1.5B"}'
+        - '{"name": "small-segment-lora-0", "path": "ttt421/nec119-small-segment-lora", "base_model_name": "Qwen/Qwen3-32B"}'
+        - '{"name": "small-segment-lora-1", "path": "ttt421/nec119-small-segment-lora", "base_model_name": "Qwen/Qwen3-32B"}'
         env:
         - name: PORT
           value: "8000"
@@ -109,13 +109,13 @@ metadata:
 data:
   configmap.yaml: |
     vLLMLoRAConfig:
-      name: vllm-llama3-8b-instruct
+      name: vllm-qwen3-32b
       port: 8000
       ensureExist:
         models:
-        - base-model: Qwen/Qwen2.5-1.5B
-          id: food-review
-          source: SriSanth2345/Qwen-1.5B-Tweet-Generations
+        - base-model: Qwen/Qwen3-32B
+          id: small-segment-lora
+          source: ttt421/nec119-small-segment-lora
         - base-model: Qwen/Qwen2.5-1.5B
           id: cad-fabricator
-          source: SriSanth2345/Qwen-1.5B-Tweet-Generations
+          source: SriSanth2345/Qwen-1.5B-Tweet-Generations
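The `--lora-modules` values in the deployment above are JSON objects passed as single strings. A small sketch parsing one of the manifest's own values to show the fields it carries (name, path, base_model_name):

```python
import json

# Parse one of the manifest's --lora-modules values to inspect its
# structure; the string below is copied from the deployment args.
raw = ('{"name": "small-segment-lora-0", '
       '"path": "ttt421/nec119-small-segment-lora", '
       '"base_model_name": "Qwen/Qwen3-32B"}')
module = json.loads(raw)
print(module["name"])             # small-segment-lora-0
print(module["base_model_name"])  # Qwen/Qwen3-32B
```

Note that `base_model_name` must match the `--model` argument, which is why both were updated together in this commit.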

config/manifests/vllm/gpu-deployment.yaml

Lines changed: 10 additions & 10 deletions
@@ -1,16 +1,16 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-llama3-8b-instruct
+  name: vllm-qwen3-32b
 spec:
   replicas: 3
   selector:
     matchLabels:
-      app: vllm-llama3-8b-instruct
+      app: vllm-qwen3-32b
   template:
     metadata:
       labels:
-        app: vllm-llama3-8b-instruct
+        app: vllm-qwen3-32b
         inference.networking.k8s.io/engine-type: vllm
     spec:
       containers:
@@ -20,7 +20,7 @@ spec:
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args:
         - "--model"
-        - "meta-llama/Llama-3.1-8B-Instruct"
+        - "Qwen/Qwen3-32B"
         - "--tensor-parallel-size"
         - "1"
         - "--port"
@@ -239,19 +239,19 @@ spec:
         emptyDir: {}
       - name: config-volume
         configMap:
-          name: vllm-llama3-8b-instruct-adapters
+          name: vllm-qwen3-32b-adapters
 ---
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: vllm-llama3-8b-instruct-adapters
+  name: vllm-qwen3-32b-adapters
 data:
   configmap.yaml: |
     vLLMLoRAConfig:
-      name: vllm-llama3-8b-instruct-adapters
+      name: vllm-qwen3-32b-adapters
       port: 8000
-      defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct
+      defaultBaseModel: Qwen/Qwen3-32B
       ensureExist:
         models:
-        - id: food-review-1
-          source: Kawon/llama3.1-food-finetune_v14_r8
+        - id: small-segment-lora-1
+          source: ttt421/nec119-small-segment-lora
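A model rename like this one touches many files, and a leftover old name breaks the selector chain silently. A toy sketch of the kind of stale-name scan such a commit implies; the `rendered` string is a stand-in, not the actual file contents:

```python
# Scan rendered manifest text for leftover references to the old
# model names; an empty result means the rename is complete.
rendered = """\
name: vllm-qwen3-32b-adapters
defaultBaseModel: Qwen/Qwen3-32B
- id: small-segment-lora-1
  source: ttt421/nec119-small-segment-lora
"""

OLD_NAMES = ("llama3-8b-instruct", "food-review")
stale = [line for line in rendered.splitlines()
         if any(name in line for name in OLD_NAMES)]
print(stale)  # []
```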

config/observability/prometheus/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -24,4 +24,4 @@ extraScrapeConfigs: |
     relabel_configs:
     - source_labels: [__meta_kubernetes_pod_label_app]
       action: keep
-      regex: vllm-llama3-8b-instruct
+      regex: vllm-qwen3-32b

docs/proposals/1816-inferenceomodelrewrite/README.md

Lines changed: 2 additions & 2 deletions
@@ -212,10 +212,10 @@ const (
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: InferenceModelRewrite
 metadata:
-  name: food-review-canary-rollout
+  name: small-segment-lora-canary-rollout
 spec:
   poolRef:
-    name: main-food-review-pool
+    name: main-small-segment-lora-pool
   rules:
   - matches:
     - model:

pkg/epp/handlers/response.go

Lines changed: 1 addition & 1 deletion
@@ -218,7 +218,7 @@ func (s *StreamingServer) generateResponseHeaders(reqCtx *RequestContext) []*con
 }
 
 // Example message if "stream_options": {"include_usage": "true"} is included in the request:
-// data: {"id":"...","object":"text_completion","created":1739400043,"model":"food-review-0","choices":[],
+// data: {"id":"...","object":"text_completion","created":1739400043,"model":"small-segment-lora-0","choices":[],
 //   "usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
 //
 // data: [DONE]
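The Go comment updated above documents the final usage chunk of an OpenAI-style streaming response. A short sketch parsing such a `data:` line; the payload string mirrors the comment's example:

```python
import json

# Strip the SSE "data: " prefix and parse the final usage chunk of an
# OpenAI-style streaming completion response.
line = ('data: {"id":"...","object":"text_completion","created":1739400043,'
        '"model":"small-segment-lora-0","choices":[],'
        '"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}')
payload = json.loads(line.removeprefix("data: "))
print(payload["model"])                  # small-segment-lora-0
print(payload["usage"]["total_tokens"])  # 17
```

The stream then terminates with a literal `data: [DONE]` sentinel, which is not JSON and must be special-cased before parsing.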
