Skip to content

Commit 1eb7727

Browse files
committed
Support multi-host in Playground
Signed-off-by: kerthcet <[email protected]>
1 parent 3061a85 commit 1eb7727

File tree

32 files changed

+17611
-14905
lines changed

32 files changed

+17611
-14905
lines changed

api/core/v1alpha1/model_types.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,23 +100,24 @@ type Flavor struct {
100100
Name FlavorName `json:"name"`
101101
// Requests defines the required accelerators to serve the model for each replica,
102102
// like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
103-
// the resource requirements for each replica. This may change in the future.
103+
// the resource requirements for each replica, usually equal to the TP size.
104104
// Not recommended to set the cpu and memory usage here:
105105
// - if using playground, you can define the cpu/mem usage at backendConfig.
106106
// - if using inference service, you can define the cpu/mem at the container resources.
107107
// However, if you define the same accelerator requests at playground/service as well,
108-
// the requests here will be covered.
108+
// the requests will be overwritten by the flavor requests.
109109
// +optional
110110
Requests v1.ResourceList `json:"requests,omitempty"`
111111
// NodeSelector represents the node candidates for Pod placements, if a node doesn't
112112
// meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin.
113113
// If nodeSelector is empty, it means every node is a candidate.
114114
// +optional
115115
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
116-
// Params stores other useful parameters and will be consumed by the autoscaling components
117-
// like cluster-autoscaler, Karpenter.
118-
// E.g. when scaling up nodes with 8x Nvidia A00, the parameter can be injected with
119-
// instance-type: p4d.24xlarge for AWS.
116+
// Params stores other useful parameters and will be consumed by cluster-autoscaler / Karpenter
117+
// for autoscaling or be defined as model parallelism parameters like TP or PP size.
118+
// E.g. with autoscaling, when scaling up nodes with 8x Nvidia A100, the parameter can be injected
119+
// with <INSTANCE-TYPE: p4d.24xlarge> for AWS.
120+
// Preset parameters: TP, PP, INSTANCE-TYPE.
120121
// +optional
121122
Params map[string]string `json:"params,omitempty"`
122123
}

api/inference/v1alpha1/backendruntime_types.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ import (
2121
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2222
)
2323

24-
// BackendRuntimeArg is preset arguments for easy to use.
25-
// Do not edit the preset names unless set the argument name explicitly
26-
// in Playground backendRuntimeConfig.
24+
// BackendRuntimeArg represents the preset arguments for ease of use.
25+
// Three preset names are provided: default, speculative-decoding, model-parallelism,
26+
// do not change these names.
2727
type BackendRuntimeArg struct {
2828
// Name represents the identifier of the backendRuntime argument.
2929
Name string `json:"name"`
@@ -32,11 +32,21 @@ type BackendRuntimeArg struct {
3232
Flags []string `json:"flags,omitempty"`
3333
}
3434

35+
// MultiHostCommands represents leader & worker commands for multi-node scenarios.
36+
type MultiHostCommands struct {
37+
Leader []string `json:"leader,omitempty"`
38+
Worker []string `json:"worker,omitempty"`
39+
}
40+
3541
// BackendRuntimeSpec defines the desired state of BackendRuntime
3642
type BackendRuntimeSpec struct {
37-
// Commands represents the default command of the backendRuntime.
43+
// Commands represents the default commands for the backendRuntime.
3844
// +optional
3945
Commands []string `json:"commands,omitempty"`
46+
// MultiHostCommands represents leader and worker commands for nodes with
47+
// different roles.
48+
// +optional
49+
MultiHostCommands *MultiHostCommands `json:"multiHostCommands,omitempty"`
4050
// Image represents the default image registry of the backendRuntime.
4151
// It will work together with version to make up a real image.
4252
Image string `json:"image"`

api/inference/v1alpha1/service_types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ import (
2323
coreapi "github.com/inftyai/llmaz/api/core/v1alpha1"
2424
)
2525

26+
const (
27+
// InferenceServiceFlavorsAnnoKey is the annotation key for the flavors specified
28+
// in the inference service, the value is a comma-separated list of flavor names.
29+
InferenceServiceFlavorsAnnoKey = "llmaz.io/inference-service-flavors"
30+
)
31+
2632
// ServiceSpec defines the desired state of Service.
2733
// Service controller will maintain multi-flavor of workloads with
2834
// different accelerators for cost or performance considerations.

api/inference/v1alpha1/zz_generated.deepcopy.go

Lines changed: 30 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

chart/templates/backends/vllm.yaml

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,49 @@ spec:
1212
- python3
1313
- -m
1414
- vllm.entrypoints.openai.api_server
15+
multiHostCommands:
16+
leader:
17+
- sh
18+
- -c
19+
- |
20+
ray start --head --disable-usage-stats --include-dashboard false
21+
22+
i=0
23+
while true; do
24+
active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
25+
if [ $active_nodes -eq $(LWS_GROUP_SIZE) ]; then
26+
echo "All ray workers are active and the ray cluster is initialized successfully."
27+
break
28+
fi
29+
if [ $i -eq 60 ]; then
30+
echo "Initialization failed. Exiting..."
31+
exit 1
32+
fi
33+
echo "Wait for $active_nodes/$(LWS_GROUP_SIZE) workers to be active."
34+
i=$((i+1))
35+
sleep 5s;
36+
done
37+
38+
python3 -m vllm.entrypoints.openai.api_server
39+
worker:
40+
- sh
41+
- -c
42+
- |
43+
i=0
44+
while true; do
45+
ray start --address=$(LWS_LEADER_ADDRESS):6379 --block
46+
47+
if [ $? -eq 0 ]; then
48+
echo "Worker: Ray runtime started with head address $(LWS_LEADER_ADDRESS):6379"
49+
break
50+
fi
51+
if [ $i -eq 60 ]; then
52+
echo "Initialization failed. Exiting..."
53+
exit 1
54+
fi
55+
echo "Waiting until the ray worker is active..."
56+
sleep 5s;
57+
done
1558
image: vllm/vllm-openai
1659
version: v0.6.0
1760
# Do not edit the preset argument name unless you know what you're doing.
@@ -39,11 +82,24 @@ spec:
3982
- "0.0.0.0"
4083
- --port
4184
- "8080"
42-
- --use-v2-block-manager
4385
- --num_speculative_tokens
4486
- "5"
4587
- -tp
4688
- "1"
89+
- name: model-parallelism
90+
flags:
91+
- --model
92+
- "{{`{{ .ModelPath }}`}}"
93+
- --served-model-name
94+
- "{{`{{ .ModelName }}`}}"
95+
- --host
96+
- "0.0.0.0"
97+
- --port
98+
- "8080"
99+
- --tensor-parallel-size
100+
- "{{`{{ .TP }}`}}"
101+
- --pipeline-parallel-size
102+
- "{{`{{ .PP }}`}}"
47103
resources:
48104
requests:
49105
cpu: 4

chart/values.yaml

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
controllerManager:
22
kubeRbacProxy:
33
args:
4-
- --secure-listen-address=0.0.0.0:8443
5-
- --upstream=http://127.0.0.1:8080/
6-
- --logtostderr=true
7-
- --v=0
4+
- --secure-listen-address=0.0.0.0:8443
5+
- --upstream=http://127.0.0.1:8080/
6+
- --logtostderr=true
7+
- --v=0
88
containerSecurityContext:
99
allowPrivilegeEscalation: false
1010
capabilities:
1111
drop:
12-
- ALL
12+
- ALL
1313
image:
1414
repository: gcr.io/kubebuilder/kube-rbac-proxy
1515
tag: v0.15.0
@@ -22,15 +22,15 @@ controllerManager:
2222
memory: 64Mi
2323
manager:
2424
args:
25-
- --health-probe-bind-address=:8081
26-
- --metrics-bind-address=127.0.0.1:8080
27-
- --leader-elect
28-
- --namespace=llmaz-system
25+
- --health-probe-bind-address=:8081
26+
- --metrics-bind-address=127.0.0.1:8080
27+
- --leader-elect
28+
- --namespace=llmaz-system
2929
containerSecurityContext:
3030
allowPrivilegeEscalation: false
3131
capabilities:
3232
drop:
33-
- ALL
33+
- ALL
3434
image:
3535
repository: inftyai/llmaz
3636
tag: v0.0.9
@@ -47,14 +47,14 @@ controllerManager:
4747
kubernetesClusterDomain: cluster.local
4848
metricsService:
4949
ports:
50-
- name: https
51-
port: 8443
52-
protocol: TCP
53-
targetPort: https
50+
- name: https
51+
port: 8443
52+
protocol: TCP
53+
targetPort: https
5454
type: ClusterIP
5555
webhookService:
5656
ports:
57-
- port: 443
58-
protocol: TCP
59-
targetPort: 9443
57+
- port: 443
58+
protocol: TCP
59+
targetPort: 9443
6060
type: ClusterIP

config/crd/bases/inference.llmaz.io_backendruntimes.yaml

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,9 @@ spec:
4747
They can be appended or overwritten by the Playground backendRuntimeConfig.
4848
items:
4949
description: |-
50-
BackendRuntimeArg is preset arguments for easy to use.
51-
Do not edit the preset names unless set the argument name explicitly
52-
in Playground backendRuntimeConfig.
50+
BackendRuntimeArg represents the preset arguments for ease of use.
51+
Three preset names are provided: default, speculative-decoding, model-parallelism,
52+
do not change these names.
5353
properties:
5454
flags:
5555
description: |-
@@ -67,7 +67,7 @@ spec:
6767
type: object
6868
type: array
6969
commands:
70-
description: Commands represents the default command of the backendRuntime.
70+
description: Commands represents the default commands for the backendRuntime.
7171
items:
7272
type: string
7373
type: array
@@ -194,6 +194,20 @@ spec:
194194
Image represents the default image registry of the backendRuntime.
195195
It will work together with version to make up a real image.
196196
type: string
197+
multiHostCommands:
198+
description: |-
199+
MultiHostCommands represents leader and worker commands for nodes with
200+
different roles.
201+
properties:
202+
leader:
203+
items:
204+
type: string
205+
type: array
206+
worker:
207+
items:
208+
type: string
209+
type: array
210+
type: object
197211
resources:
198212
description: |-
199213
Resources represents the resource requirements for backendRuntime, like cpu/mem,

config/crd/bases/inference.llmaz.io_services.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16378,6 +16378,9 @@ spec:
1637816378
description: |-
1637916379
SubdomainPolicy determines the policy that will be used when creating
1638016380
the headless service, defaults to shared
16381+
enum:
16382+
- Shared
16383+
- UniquePerReplica
1638116384
type: string
1638216385
required:
1638316386
- subdomainPolicy

config/crd/bases/llmaz.io_openmodels.yaml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,11 @@ spec:
7373
additionalProperties:
7474
type: string
7575
description: |-
76-
Params stores other useful parameters and will be consumed by the autoscaling components
77-
like cluster-autoscaler, Karpenter.
78-
E.g. when scaling up nodes with 8x Nvidia A00, the parameter can be injected with
79-
instance-type: p4d.24xlarge for AWS.
76+
Params stores other useful parameters and will be consumed by cluster-autoscaler / Karpenter
77+
for autoscaling or be defined as model parallelism parameters like TP or PP size.
78+
E.g. with autoscaling, when scaling up nodes with 8x Nvidia A100, the parameter can be injected
79+
with <INSTANCE-TYPE: p4d.24xlarge> for AWS.
80+
Preset parameters: TP, PP, INSTANCE-TYPE.
8081
type: object
8182
requests:
8283
additionalProperties:
@@ -88,12 +89,12 @@ spec:
8889
description: |-
8990
Requests defines the required accelerators to serve the model for each replica,
9091
like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
91-
the resource requirements for each replica. This may change in the future.
92+
the resource requirements for each replica, usually equals to the TP size.
9293
Not recommended to set the cpu and memory usage here:
9394
- if using playground, you can define the cpu/mem usage at backendConfig.
9495
- if using inference service, you can define the cpu/mem at the container resources.
9596
However, if you define the same accelerator requests at playground/service as well,
96-
the requests here will be covered.
97+
the requests will be overwritten by the flavor requests.
9798
type: object
9899
required:
99100
- name

config/manager/kustomization.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1
44
kind: Kustomization
55
images:
66
- name: controller
7-
newName: inftyai/llmaz
8-
newTag: v0.0.9
7+
newName: inftyai/test
8+
newTag: llmaz-011601

0 commit comments

Comments
 (0)