### Speculative Decoding with vLLM
[Speculative Decoding](https://arxiv.org/abs/2211.17192) can efficiently improve inference performance; see the [example](./speculative-decoding/vllm/) here.
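
A minimal sketch of what enabling speculative decoding in vLLM's offline API can look like. The model names are placeholders (a small Llama draft model proposing tokens for a larger target), and the exact argument names vary across vLLM releases (newer versions take a single `speculative_config` dict instead of the separate arguments shown here); the linked example is the authoritative configuration.

```python
from vllm import LLM, SamplingParams

# The target model produces the final tokens; the small draft model proposes
# candidate tokens that the target verifies in a single forward pass.
# Both model names are placeholders, not the repo's example configuration.
llm = LLM(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",          # target model
    speculative_model="meta-llama/Llama-3.2-1B-Instruct",   # draft model
    num_speculative_tokens=5,   # draft tokens proposed per decoding step
)

outputs = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```
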
### Multi-Host Inference
Model sizes keep growing: Llama 3.1 405B in FP16 requires more than 750 GB of GPU memory for the weights alone, before even considering the KV cache. Even a single host with 8 x NVIDIA H100 GPUs (80 GB of HBM each) cannot fit the model, so a multi-host deployment is required; see the [example](./multi-nodes/) here.
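
As a back-of-envelope check of the sizing claim above (a sketch using only the public 405B parameter count and 2 bytes per FP16 weight):

```python
# Weights-only memory for Llama 3.1 405B in FP16 (2 bytes per parameter),
# ignoring the KV cache, activations, and runtime overhead.
params = 405e9
fp16_bytes_per_param = 2
weight_gb = params * fp16_bytes_per_param / 1e9
print(f"weights only: ~{weight_gb:.0f} GB")           # ~810 GB

# One host with 8 x H100 (80 GB HBM each) falls well short of that,
# so the weights alone already force a multi-host deployment.
single_host_gb = 8 * 80
print(f"single 8 x H100 host: {single_host_gb} GB")   # 640 GB
```

A common way to bridge the gap (and presumably what a multi-node vLLM deployment looks like) is tensor parallelism within each host combined with pipeline parallelism across hosts, e.g. vLLM's `tensor_parallel_size=8` with `pipeline_parallel_size=2`; the linked example shows the concrete setup.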