Skip to content

Update Ray version in Dockerfile and add v5 configs #161

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion kuberay/image/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM rayproject/ray:2.22.0-py310
FROM rayproject/ray:2.32.0-py310

RUN pip install flax==0.8.3
RUN pip install jax[tpu]==0.4.30 -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
Expand Down
4 changes: 2 additions & 2 deletions kuberay/manifests/ray-cluster.tpu-v4-multihost.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ spec:
- mountPath: /tmp/ray
name: ray-logs
name: ray-head
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
imagePullPolicy: IfNotPresent
resources:
limits:
Expand Down Expand Up @@ -100,7 +100,7 @@ spec:
mountPath: /llama
readOnly: true
name: ray-worker
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
imagePullPolicy: IfNotPresent
resources:
limits:
Expand Down
4 changes: 2 additions & 2 deletions kuberay/manifests/ray-cluster.tpu-v4-singlehost.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ spec:
- mountPath: /tmp/ray
name: ray-logs
name: ray-head
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
imagePullPolicy: IfNotPresent
resources:
limits:
Expand Down Expand Up @@ -96,7 +96,7 @@ spec:
mountPath: /llama
readOnly: true
name: ray-worker
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
imagePullPolicy: IfNotPresent
resources:
limits:
Expand Down
144 changes: 144 additions & 0 deletions kuberay/manifests/ray-cluster.tpu-v5-multihost.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# This template contains a Kuberay cluster using a 2x2x2 TPU v4 PodSlice.
# To get access to TPU resources, please follow instructions in this link:
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: example-cluster-kuberay
spec:
  # Head group: a single CPU-only pod (no google.com/tpu resource is
  # requested here); TPU chips are requested only by the worker group below.
  headGroupSpec:
    rayStartParams: {}
    template:
      spec:
        imagePullSecrets: []
        serviceAccountName: ray-ksa
        containers:
          - volumeMounts:
              # Read-only checkpoint mount backed by the GCS FUSE volume
              # declared under `volumes` below.
              - name: gcs-fuse-checkpoint
                mountPath: /llama
                readOnly: true
              - mountPath: /tmp/ray
                name: ray-logs
            name: ray-head
            image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
            imagePullPolicy: IfNotPresent
            # Requests equal limits for every resource.
            resources:
              limits:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
              requests:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
            securityContext: {}
            env:
              - name: JAX_PLATFORMS
                value: "cpu"
              - name: RAY_memory_monitor_refresh_ms
                value: "0"
              # NOTE(review): Kubernetes only expands $(VAR) references in env
              # values, so ${grafana_host} looks like a placeholder that must
              # be substituted before this manifest is applied - confirm.
              - name: RAY_GRAFANA_IFRAME_HOST
                value: "http://${grafana_host}"
              - name: RAY_GRAFANA_HOST
                value: http://grafana:80
              - name: RAY_PROMETHEUS_HOST
                value: http://frontend:9090
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
              - containerPort: 8000
                name: serve
              - containerPort: 8471
                name: slicebuilder
              - containerPort: 8081
                name: mxla
              - containerPort: 8888
                name: grpc
        volumes:
          - emptyDir: {}
            name: ray-logs
          - name: gcs-fuse-checkpoint
            csi:
              driver: gcsfuse.csi.storage.gke.io
              readOnly: true
              volumeAttributes:
                # NOTE(review): author-specific bucket name; presumably needs
                # to be replaced with your own checkpoint bucket.
                bucketName: ricliu-llama2-70b-chat
                mountOptions: "implicit-dirs"
      metadata:
        annotations:
          gke-gcsfuse/volumes: "true"
        labels:
          cloud.google.com/gke-ray-node-type: head
          app.kubernetes.io/name: kuberay
          app.kubernetes.io/instance: example-cluster

  # Worker group: one replica made of 2 hosts (numOfHosts), each pod
  # requesting 4 TPU chips, scheduled onto a 2x4 tpu-v5-lite-podslice
  # node pool via the nodeSelector below.
  workerGroupSpecs:
    - rayStartParams: {}
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      numOfHosts: 2
      groupName: workergroup
      template:
        spec:
          imagePullSecrets: []
          serviceAccountName: ray-ksa
          containers:
            - volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
                - name: gcs-fuse-checkpoint
                  mountPath: /llama
                  readOnly: true
              name: ray-worker
              image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
              imagePullPolicy: IfNotPresent
              # Requests equal limits for every resource, including the
              # per-pod TPU chip count.
              resources:
                limits:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  google.com/tpu: "4"
                  memory: 180G
                requests:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  google.com/tpu: "4"
                  memory: 180G
              securityContext: {}
              env:
                # NOTE(review): the worker also sets JAX_PLATFORMS to "cpu"
                # even though it requests TPU chips; presumably the workload
                # overrides this at runtime - confirm.
                - name: JAX_PLATFORMS
                  value: "cpu"
              ports: null
          volumes:
            - emptyDir: {}
              name: ray-logs
            - name: gcs-fuse-checkpoint
              csi:
                driver: gcsfuse.csi.storage.gke.io
                readOnly: true
                volumeAttributes:
                  # NOTE(review): author-specific bucket name; presumably
                  # needs to be replaced with your own checkpoint bucket.
                  bucketName: ricliu-llama2-70b-chat
                  mountOptions: "implicit-dirs"
          nodeSelector:
            iam.gke.io/gke-metadata-server-enabled: "true"
            cloud.google.com/gke-tpu-topology: 2x4
            cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
        metadata:
          annotations:
            gke-gcsfuse/volumes: "true"
          labels:
            cloud.google.com/gke-ray-node-type: worker
            app.kubernetes.io/name: kuberay
            app.kubernetes.io/instance: example-cluster

140 changes: 140 additions & 0 deletions kuberay/manifests/ray-cluster.tpu-v5-singlehost.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# This template contains a Kuberay cluster using a 2x2x1 TPU v4 PodSlice.
# To get access to TPU resources, please follow instructions in this link:
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: example-cluster-kuberay
spec:
  # Head group: a single CPU-only pod (no google.com/tpu resource is
  # requested here); TPU chips are requested only by the worker group below.
  headGroupSpec:
    rayStartParams: {}
    template:
      spec:
        imagePullSecrets: []
        serviceAccountName: ray-ksa
        containers:
          - volumeMounts:
              # Read-only checkpoint mount backed by the GCS FUSE volume
              # declared under `volumes` below.
              - name: gcs-fuse-checkpoint
                mountPath: /llama
                readOnly: true
              - mountPath: /tmp/ray
                name: ray-logs
            name: ray-head
            image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
            imagePullPolicy: IfNotPresent
            # Requests equal limits for every resource.
            resources:
              limits:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
              requests:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
            securityContext: {}
            env:
              - name: JAX_PLATFORMS
                value: "cpu"
              - name: RAY_memory_monitor_refresh_ms
                value: "0"
              # NOTE(review): Kubernetes only expands $(VAR) references in env
              # values, so ${grafana_host} looks like a placeholder that must
              # be substituted before this manifest is applied - confirm.
              - name: RAY_GRAFANA_IFRAME_HOST
                value: "http://${grafana_host}"
              - name: RAY_GRAFANA_HOST
                value: http://grafana:80
              - name: RAY_PROMETHEUS_HOST
                value: http://frontend:9090
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
              - containerPort: 8000
                name: serve
              - containerPort: 8888
                name: grpc
        volumes:
          - emptyDir: {}
            name: ray-logs
          - name: gcs-fuse-checkpoint
            csi:
              driver: gcsfuse.csi.storage.gke.io
              readOnly: true
              volumeAttributes:
                # NOTE(review): author-specific bucket name; presumably needs
                # to be replaced with your own checkpoint bucket.
                bucketName: ricliu-llama2
                mountOptions: "implicit-dirs"
      metadata:
        annotations:
          gke-gcsfuse/volumes: "true"
        labels:
          cloud.google.com/gke-ray-node-type: head
          app.kubernetes.io/name: kuberay
          app.kubernetes.io/instance: example-cluster

  # Worker group: one replica on a single host (numOfHosts: 1) whose pod
  # requests all 8 TPU chips, scheduled onto a 2x4 tpu-v5-lite-podslice
  # node pool via the nodeSelector below.
  workerGroupSpecs:
    - rayStartParams: {}
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      numOfHosts: 1
      groupName: workergroup
      template:
        spec:
          imagePullSecrets: []
          serviceAccountName: ray-ksa
          containers:
            - volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
                - name: gcs-fuse-checkpoint
                  mountPath: /llama
                  readOnly: true
              name: ray-worker
              image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240729
              imagePullPolicy: IfNotPresent
              # Requests equal limits for every resource, including the
              # per-pod TPU chip count.
              resources:
                limits:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  google.com/tpu: "8"
                  memory: 200G
                requests:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  google.com/tpu: "8"
                  memory: 200G
              securityContext: {}
              env:
                # NOTE(review): the worker also sets JAX_PLATFORMS to "cpu"
                # even though it requests TPU chips; presumably the workload
                # overrides this at runtime - confirm.
                - name: JAX_PLATFORMS
                  value: "cpu"
              ports: null
          volumes:
            - emptyDir: {}
              name: ray-logs
            - name: gcs-fuse-checkpoint
              csi:
                driver: gcsfuse.csi.storage.gke.io
                readOnly: true
                volumeAttributes:
                  # NOTE(review): author-specific bucket name; presumably
                  # needs to be replaced with your own checkpoint bucket.
                  bucketName: ricliu-llama2
                  mountOptions: "implicit-dirs"
          nodeSelector:
            cloud.google.com/gke-tpu-topology: 2x4
            cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
            iam.gke.io/gke-metadata-server-enabled: "true"
        metadata:
          annotations:
            gke-gcsfuse/volumes: "true"
          labels:
            cloud.google.com/gke-ray-node-type: worker
            app.kubernetes.io/name: kuberay
            app.kubernetes.io/instance: example-cluster

Loading