Skip to content

Commit bf8f87f

Browse files
authored
Merge branch 'master' into support-for-gpu-cluster-using-oci-runner
Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com>
2 parents 37ba59b + 4443f79 commit bf8f87f

27 files changed

Lines changed: 2257 additions & 863 deletions

File tree

.github/workflows/build-and-push-images.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ on:
44
push:
55
branches:
66
- master
7-
- 'release-*'
7+
- "release-*"
88
tags:
9-
- 'v*'
9+
- "v*"
1010
pull_request:
1111

1212
jobs:
@@ -34,9 +34,10 @@ jobs:
3434
- component-name: deepspeed-runtime
3535
dockerfile: cmd/runtimes/deepspeed/Dockerfile
3636
platforms: linux/amd64,linux/arm64
37+
# TODO (andreyvelich): mlx[cuda] doesn't support arm at the moment: https://github.com/ml-explore/mlx/issues/2469
3738
- component-name: mlx-runtime
3839
dockerfile: cmd/runtimes/mlx/Dockerfile
39-
platforms: linux/arm64
40+
platforms: linux/amd64
4041
- component-name: torchtune-trainer
4142
dockerfile: cmd/trainers/torchtune/Dockerfile
4243
platforms: linux/amd64,linux/arm64

.github/workflows/test-go.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ jobs:
88
generate:
99
name: Generate
1010
runs-on: ubuntu-latest
11+
if: ${{ github.repository == 'kubeflow/trainer' }}
1112
env:
1213
GOPATH: ${{ github.workspace }}/go
1314
defaults:
@@ -47,6 +48,7 @@ jobs:
4748
test:
4849
name: Test
4950
runs-on: ubuntu-latest
51+
if: ${{ github.repository == 'kubeflow/trainer' }}
5052
env:
5153
GOPATH: ${{ github.workspace }}/go
5254
defaults:

charts/kubeflow-trainer/crds/trainer.kubeflow.org_trainjobs.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3081,6 +3081,11 @@ spec:
30813081
x-kubernetes-list-type: map
30823082
type: object
30833083
type: object
3084+
x-kubernetes-validations:
3085+
- message: metadata.name must match RFC 1035 DNS label format
3086+
rule: self.metadata.name.matches('^[a-z]([-a-z0-9]*[a-z0-9])?$')
3087+
- message: metadata.name must be no more than 63 characters
3088+
rule: size(self.metadata.name) <= 63
30843089
served: true
30853090
storage: true
30863091
subresources:

charts/kubeflow-trainer/templates/rbac/clusterrole.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@ rules:
4242
- patch
4343
- update
4444
- watch
45+
- apiGroups:
46+
- ""
47+
resources:
48+
- limitranges
49+
verbs:
50+
- get
51+
- list
52+
- watch
4553
- apiGroups:
4654
- admissionregistration.k8s.io
4755
resources:
@@ -62,6 +70,14 @@ rules:
6270
- patch
6371
- update
6472
- watch
73+
- apiGroups:
74+
- node.k8s.io
75+
resources:
76+
- runtimeclasses
77+
verbs:
78+
- get
79+
- list
80+
- watch
6581
- apiGroups:
6682
- scheduling.x-k8s.io
6783
resources:

cmd/runtimes/deepspeed/Dockerfile

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,11 @@
11
FROM mpioperator/base:v0.6.0 AS mpi
22
FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
33

4-
# Disable interactive dialog from apt.
5-
ENV DEBIAN_FRONTEND noninteractive
6-
7-
# Install libraries required for OpenMPI to work.
8-
RUN apt-get update && apt install -y --no-install-recommends \
9-
cmake g++ gcc \
10-
wget vim \
11-
openssh-client openssh-server libcap2-bin \
12-
libopenmpi-dev openmpi-bin
4+
# Install libraries required for OpenMPI to work. The Ubuntu 22.04 base installs OpenMPI 4.1.x from apt (not 5.0.7).
5+
RUN apt update && apt install -y --no-install-recommends \
6+
openssh-server openssh-client libcap2-bin \
7+
g++ libopenmpi-dev \
8+
python3-dev pip && rm -f /usr/bin/python && ln -s /usr/bin/python3 /usr/bin/python && rm -rf /var/lib/apt/lists/*
139

1410
# Add capability to run sshd as non-root.
1511
RUN setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd
@@ -24,10 +20,7 @@ COPY --from=mpi /etc/ssh/ssh_config /etc/ssh/ssh_config
2420
COPY --from=mpi /etc/ssh/sshd_config /etc/ssh/sshd_config
2521
COPY --from=mpi /home/mpiuser/.sshd_config /home/mpiuser/.sshd_config
2622

27-
# Install the required Python packages.
28-
RUN apt install -y python3-dev pip && rm -f /usr/bin/python && ln -s /usr/bin/python3 /usr/bin/python
29-
30-
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
23+
# Set home directory for mpiuser.
3124
ENV HOME=/home/mpiuser
3225
ENV PATH=$HOME/.local/bin:$PATH
3326

cmd/runtimes/mlx/Dockerfile

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
FROM mpioperator/base:v0.6.0 AS mpi
2-
FROM debian:trixie
2+
FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
33

4-
# Install libraries required for OpenMPI and MLX. This image installs OpenMPI 5.0.7
4+
# Install libraries required for OpenMPI and MLX. The Ubuntu 22.04 base installs OpenMPI 4.1.x from apt (not 5.0.7).
55
RUN apt update && apt install -y --no-install-recommends \
66
openssh-server openssh-client libcap2-bin \
7-
libopenmpi-dev \
8-
git g++ libblas-dev liblapack-dev liblapacke-dev
7+
g++ libopenmpi-dev libblas-dev liblapack-dev liblapacke-dev \
8+
python3-dev pip && rm -f /usr/bin/python && ln -s /usr/bin/python3 /usr/bin/python && rm -rf /var/lib/apt/lists/*
99

1010
# Add capability to run sshd as non-root.
1111
RUN setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd
12+
RUN apt remove libcap2-bin -y
1213

1314
# Configure mpiuser and home directory.
1415
RUN useradd -m mpiuser
@@ -19,12 +20,14 @@ COPY --from=mpi /etc/ssh/ssh_config /etc/ssh/ssh_config
1920
COPY --from=mpi /etc/ssh/sshd_config /etc/ssh/sshd_config
2021
COPY --from=mpi /home/mpiuser/.sshd_config /home/mpiuser/.sshd_config
2122

22-
# Install the required Python packages. This image has Python 3.13
23-
RUN apt update && apt install -y python3 python3-pip && ln -s /usr/bin/python3 /usr/bin/python && apt clean
23+
# Set home directory for mpiuser.
24+
ENV HOME=/home/mpiuser
25+
ENV PATH=$HOME/.local/bin:$PATH
2426

25-
# We have to build MLX and MLX Data from source.
26-
RUN git clone https://github.com/ml-explore/mlx.git
27-
RUN cd mlx && git checkout f018e248cd75dbb65668f418d6afb67842ea28b7 && CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -v --break-system-packages .
27+
COPY cmd/runtimes/mlx/requirements.txt .
28+
RUN pip install --user -r requirements.txt
2829

29-
RUN git clone https://github.com/ml-explore/mlx-data.git
30-
RUN cd mlx-data && git checkout 79516daa75aa3e9fd72fc5e3fb5e9e629912feac && CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -v --break-system-packages .
30+
# Give mpiuser permission to download packages and HF models.
31+
# .cache directory is used by ML frameworks to download models.
32+
RUN chown -R mpiuser:mpiuser /home/mpiuser/.local
33+
RUN mkdir -p /home/mpiuser/.cache && chown -R mpiuser:mpiuser /home/mpiuser/.cache

cmd/runtimes/mlx/requirements.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# MLX libraries.
2+
mlx[cuda]==0.28.0
3+
mlx-data==0.1.0
4+
mlx-lm==0.26.3
5+
# HuggingFace libraries.
6+
datasets==4.0.0

examples/deepspeed/text-summarization/T5-Fine-Tuning.ipynb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,7 @@
1414
"\n",
1515
"Wikihow dataset: https://huggingface.co/datasets/sentence-transformers/wikihow\n",
1616
"\n",
17-
"This Notebook will use **4 x A100 NVIDIA GPUs**, to fine-tune T5 model on 2 nodes (every node has 2 GPUs).\n",
18-
"\n",
19-
"**TODO (andreyvelich)**: Currently, to run this Notebook you have to manualy update the container resources in the ClusterTrainingRuntime, since we don't propogate TrainJob's `resources_per_node` to the JobSet"
17+
"This Notebook will use **4 x A100 NVIDIA GPUs**, to fine-tune T5 model on 2 nodes (every node has 2 GPUs)."
2018
]
2119
},
2220
{
@@ -295,7 +293,7 @@
295293
"outputs": [],
296294
"source": [
297295
"MODEL_NAME = \"t5-base\"\n",
298-
"BUCKET_NAME = \"TODO: add your bucket here\""
296+
"# BUCKET_NAME = \"TODO: add your bucket here\""
299297
]
300298
},
301299
{

0 commit comments

Comments
 (0)