Commit 663c102

Jetstream + RayServe deployment for interleave mode (#146)

* kuberay manifests and dockerfile
* sample ray_serve
* Single host interleave
* update image
* Gcsfuse and jax platform fix
* multihost
* Cleanup
* Cleanup
* Parameterize tpu head type
* Format
* revert
* revert
* update readme
* fix format
* lint

1 parent 50a6d10 commit 663c102

File tree

5 files changed: +543 −0 lines changed

README.md

Lines changed: 86 additions & 0 deletions
@@ -199,6 +199,92 @@ python benchmarks/benchmark_serving.py --tokenizer $tokenizer_path --num-prompts

Please look at `deps/JetStream/benchmarks/README.md` for more information.

## Run server with Ray Serve

### Prerequisites

If running on GKE:

1. Follow the instructions in [this guide](https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/ray-on-gke/guides/tpu) to set up a GKE cluster and the TPU webhook.
2. Follow the instructions [here](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/cloud-storage-fuse-csi-driver) to enable GCSFuse for your cluster. This is needed to store the converted weights.
3. Deploy one of the sample Kuberay cluster configurations:

   ```bash
   kubectl apply -f kuberay/manifests/ray-cluster.tpu-v4-singlehost.yaml
   ```

   or

   ```bash
   kubectl apply -f kuberay/manifests/ray-cluster.tpu-v4-multihost.yaml
   ```
### Start a Ray Serve deployment

Single-host (Llama2 7B):

```bash
export RAY_ADDRESS=http://localhost:8265

kubectl port-forward svc/example-cluster-kuberay-head-svc 8265:8265 &

ray job submit --runtime-env-json='{"working_dir": "."}' -- python run_ray_serve_interleave.py --tpu_chips=4 --num_hosts=1 --size=7b --model_name=llama-2 --batch_size=32 --max_cache_length=2048 --tokenizer_path=/llama/tokenizer.model --checkpoint_path=/llama/ckpt --quantize_weights=True --quantize_type="int8_per_channel" --quantize_kv_cache=True --sharding_config="default_shardings/llama.yaml"
```

Multi-host (Llama2 70B):

```bash
export RAY_ADDRESS=http://localhost:8265

kubectl port-forward svc/example-cluster-kuberay-head-svc 8265:8265 &

ray job submit --runtime-env-json='{"working_dir": "."}' -- python run_ray_serve_interleave.py --tpu_chips=8 --num_hosts=2 --size=70b --model_name=llama-2 --batch_size=8 --max_cache_length=2048 --tokenizer_path=/llama/tokenizer.model --checkpoint_path=/llama/ckpt --quantize_weights=True --quantize_type="int8_per_channel" --quantize_kv_cache=True --sharding_config="default_shardings/llama.yaml"
```
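The two invocations differ only in a handful of per-size flags. A small helper (hypothetical, not part of the repo) can assemble the flag list and makes the single-host vs. multi-host differences explicit:

```python
# Hypothetical helper: builds the run_ray_serve_interleave.py flag list.
# Flag names and values mirror the two example commands above.
def serve_flags(size: str) -> list[str]:
    # Per-size settings taken from the 7B (single-host) and 70B
    # (multi-host) examples.
    presets = {
        "7b": {"tpu_chips": 4, "num_hosts": 1, "batch_size": 32},
        "70b": {"tpu_chips": 8, "num_hosts": 2, "batch_size": 8},
    }
    p = presets[size]
    return [
        f"--tpu_chips={p['tpu_chips']}",
        f"--num_hosts={p['num_hosts']}",
        f"--size={size}",
        "--model_name=llama-2",
        f"--batch_size={p['batch_size']}",
        "--max_cache_length=2048",
        "--tokenizer_path=/llama/tokenizer.model",
        "--checkpoint_path=/llama/ckpt",
        "--quantize_weights=True",
        "--quantize_type=int8_per_channel",
        "--quantize_kv_cache=True",
        "--sharding_config=default_shardings/llama.yaml",
    ]

cmd = ["ray", "job", "submit",
       '--runtime-env-json={"working_dir": "."}',
       "--", "python", "run_ray_serve_interleave.py"] + serve_flags("70b")
print(" ".join(cmd))
```

Note that the larger model trades batch size for chips: 70B runs with `--batch_size=8` across 8 chips on 2 hosts, while 7B fits 32 requests on a single 4-chip host.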
### Sending an inference request

Port-forward to port 8888 for gRPC:

```bash
kubectl port-forward svc/example-cluster-kuberay-head-svc 8888:8888 &
```

Sample Python script:

```python
import grpc

from jetstream.core.proto import jetstream_pb2
from jetstream.core.proto import jetstream_pb2_grpc

prompt = "What are the top 5 languages?"

channel = grpc.insecure_channel("localhost:8888")
stub = jetstream_pb2_grpc.OrchestratorStub(channel)

request = jetstream_pb2.DecodeRequest(
    text_content=jetstream_pb2.DecodeRequest.TextContent(text=prompt),
    priority=0,
    max_tokens=2000,
)

# Decode returns a response stream; collect the text chunks as they arrive.
response = stub.Decode(request)
output = []
for resp in response:
    output.extend(resp.stream_content.samples[0].text)

text_output = "".join(output)
print(f"Prompt: {prompt}")
print(f"Response: {text_output}")
```
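The accumulation loop in the script can be sanity-checked offline with stand-in chunk objects (hypothetical stubs, not the real protobuf classes), which shows how the streamed pieces are joined into the final response:

```python
from types import SimpleNamespace

# Hypothetical stand-ins for the streamed protobuf messages: each chunk
# carries one sample whose `text` field holds a list of decoded pieces.
def chunk(pieces):
    sample = SimpleNamespace(text=pieces)
    return SimpleNamespace(stream_content=SimpleNamespace(samples=[sample]))

stream = [chunk(["Hello"]), chunk([", "]), chunk(["world"])]

# Same accumulation pattern as the script above.
output = []
for resp in stream:
    output.extend(resp.stream_content.samples[0].text)

text_output = "".join(output)
print(text_output)  # Hello, world
```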
# Typical Errors

## Unexpected keyword argument 'device'

kuberay/image/Dockerfile

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@

```dockerfile
FROM rayproject/ray:2.22.0-py310

RUN pip install flax==0.8.3
RUN pip install jax[tpu]==0.4.30 -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
RUN pip install tensorflow-text
RUN pip install tensorflow

RUN pip install torch==2.3.1+cpu --index-url https://download.pytorch.org/whl/cpu
RUN pip install tensorflow flatbuffers absl-py sentencepiece seqio google-cloud-storage
RUN pip install safetensors colorama coverage humanize

RUN git clone https://github.com/google/jetstream-pytorch
WORKDIR jetstream-pytorch

RUN git submodule update --init --recursive
RUN pip install -e .
```
kuberay/manifests/ray-cluster.tpu-v4-multihost.yaml

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@

```yaml
# This template contains a Kuberay cluster using a 2x2x2 TPU v4 PodSlice.
# To get access to TPU resources, please follow instructions in this link:
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: example-cluster-kuberay
spec:
  headGroupSpec:
    rayStartParams: {}
    template:
      spec:
        imagePullSecrets: []
        serviceAccountName: ray-ksa
        containers:
          - volumeMounts:
              - name: gcs-fuse-checkpoint
                mountPath: /llama
                readOnly: true
              - mountPath: /tmp/ray
                name: ray-logs
            name: ray-head
            image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
            imagePullPolicy: IfNotPresent
            resources:
              limits:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
              requests:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
            securityContext: {}
            env:
              - name: JAX_PLATFORMS
                value: "cpu"
              - name: RAY_memory_monitor_refresh_ms
                value: "0"
              - name: RAY_GRAFANA_IFRAME_HOST
                value: http://${grafana_host}
              - name: RAY_GRAFANA_HOST
                value: http://grafana:80
              - name: RAY_PROMETHEUS_HOST
                value: http://frontend:9090
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
              - containerPort: 8000
                name: serve
              - containerPort: 8471
                name: slicebuilder
              - containerPort: 8081
                name: mxla
              - containerPort: 8888
                name: grpc
        volumes:
          - emptyDir: {}
            name: ray-logs
          - name: gcs-fuse-checkpoint
            csi:
              driver: gcsfuse.csi.storage.gke.io
              readOnly: true
              volumeAttributes:
                bucketName: ricliu-llama2-70b-chat
                mountOptions: "implicit-dirs"
      metadata:
        annotations:
          gke-gcsfuse/volumes: "true"
        labels:
          cloud.google.com/gke-ray-node-type: head
          app.kubernetes.io/name: kuberay
          app.kubernetes.io/instance: example-cluster

  workerGroupSpecs:
    - rayStartParams: {}
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      numOfHosts: 2
      groupName: workergroup
      template:
        spec:
          imagePullSecrets: []
          serviceAccountName: ray-ksa
          containers:
            - volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
                - name: gcs-fuse-checkpoint
                  mountPath: /llama
                  readOnly: true
              name: ray-worker
              image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
              imagePullPolicy: IfNotPresent
              resources:
                limits:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  google.com/tpu: "4"
                  memory: 200G
                requests:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  google.com/tpu: "4"
                  memory: 200G
              securityContext: {}
              env:
                - name: JAX_PLATFORMS
                  value: "cpu"
              ports: null
          volumes:
            - emptyDir: {}
              name: ray-logs
            - name: gcs-fuse-checkpoint
              csi:
                driver: gcsfuse.csi.storage.gke.io
                readOnly: true
                volumeAttributes:
                  bucketName: ricliu-llama2-70b-chat
                  mountOptions: "implicit-dirs"
          nodeSelector:
            cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
            cloud.google.com/gke-tpu-topology: 2x2x2
            iam.gke.io/gke-metadata-server-enabled: "true"
        metadata:
          annotations:
            gke-gcsfuse/volumes: "true"
          labels:
            cloud.google.com/gke-ray-node-type: worker
            app.kubernetes.io/name: kuberay
            app.kubernetes.io/instance: example-cluster
```
kuberay/manifests/ray-cluster.tpu-v4-singlehost.yaml

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@

```yaml
# This template contains a Kuberay cluster using a 2x2x1 TPU v4 PodSlice.
# To get access to TPU resources, please follow instructions in this link:
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: example-cluster-kuberay
spec:
  headGroupSpec:
    rayStartParams: {}
    template:
      spec:
        imagePullSecrets: []
        serviceAccountName: ray-ksa
        containers:
          - volumeMounts:
              - name: gcs-fuse-checkpoint
                mountPath: /llama
                readOnly: true
              - mountPath: /tmp/ray
                name: ray-logs
            name: ray-head
            image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
            imagePullPolicy: IfNotPresent
            resources:
              limits:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
              requests:
                cpu: "4"
                ephemeral-storage: 30Gi
                memory: 40G
            securityContext: {}
            env:
              - name: JAX_PLATFORMS
                value: "cpu"
              - name: RAY_memory_monitor_refresh_ms
                value: "0"
              - name: RAY_GRAFANA_IFRAME_HOST
                value: http://${grafana_host}
              - name: RAY_GRAFANA_HOST
                value: http://grafana:80
              - name: RAY_PROMETHEUS_HOST
                value: http://frontend:9090
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
              - containerPort: 8000
                name: serve
              - containerPort: 8888
                name: grpc
        volumes:
          - emptyDir: {}
            name: ray-logs
          - name: gcs-fuse-checkpoint
            csi:
              driver: gcsfuse.csi.storage.gke.io
              readOnly: true
              volumeAttributes:
                bucketName: ricliu-llama2
                mountOptions: "implicit-dirs"
      metadata:
        annotations:
          gke-gcsfuse/volumes: "true"
        labels:
          cloud.google.com/gke-ray-node-type: head
          app.kubernetes.io/name: kuberay
          app.kubernetes.io/instance: example-cluster

  workerGroupSpecs:
    - rayStartParams: {}
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      numOfHosts: 1
      groupName: workergroup
      template:
        spec:
          imagePullSecrets: []
          serviceAccountName: ray-ksa
          containers:
            - volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
                - name: gcs-fuse-checkpoint
                  mountPath: /llama
                  readOnly: true
              name: ray-worker
              image: gcr.io/tpu-vm-gke-testing/ricliu-jetstream:20240709
              imagePullPolicy: IfNotPresent
              resources:
                limits:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  google.com/tpu: "4"
                  memory: 200G
                requests:
                  cpu: "8"
                  ephemeral-storage: 30Gi
                  google.com/tpu: "4"
                  memory: 200G
              securityContext: {}
              env:
                - name: JAX_PLATFORMS
                  value: "cpu"
              ports: null
          volumes:
            - emptyDir: {}
              name: ray-logs
            - name: gcs-fuse-checkpoint
              csi:
                driver: gcsfuse.csi.storage.gke.io
                readOnly: true
                volumeAttributes:
                  bucketName: ricliu-llama2
                  mountOptions: "implicit-dirs"
          nodeSelector:
            cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
            cloud.google.com/gke-tpu-topology: 2x2x1
            iam.gke.io/gke-metadata-server-enabled: "true"
        metadata:
          annotations:
            gke-gcsfuse/volumes: "true"
          labels:
            cloud.google.com/gke-ray-node-type: worker
            app.kubernetes.io/name: kuberay
            app.kubernetes.io/instance: example-cluster
```
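Across the two manifests, `numOfHosts` follows from the slice topology: a TPU v4 host exposes 4 chips (matching the `google.com/tpu: "4"` resource requests), so a 2x2x1 slice (4 chips) needs 1 worker host and a 2x2x2 slice (8 chips) needs 2. A quick sketch of that arithmetic (helper names are illustrative, not from the repo):

```python
# Illustrative helper: derive chip count and numOfHosts from a TPU v4
# topology string such as "2x2x2". Assumes 4 chips per v4 host, matching
# the google.com/tpu: "4" resource requests in the worker specs above.
CHIPS_PER_V4_HOST = 4

def hosts_for_topology(topology: str) -> tuple[int, int]:
    chips = 1
    for dim in topology.split("x"):
        chips *= int(dim)
    # Ceiling division, though v4 slices come in whole-host multiples.
    num_hosts = -(-chips // CHIPS_PER_V4_HOST)
    return chips, num_hosts

print(hosts_for_topology("2x2x1"))  # (4, 1) -> singlehost manifest
print(hosts_for_topology("2x2x2"))  # (8, 2) -> multihost manifest
```

These counts also line up with the Ray Serve commands in the README: `--tpu_chips=4 --num_hosts=1` for the 7B model and `--tpu_chips=8 --num_hosts=2` for 70B.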
