kubeflow
diff --git a/‎.github/workflows/build-and-push-images.yaml‎
Lines changed: 4 additions & 3 deletions b/‎.github/workflows/build-and-push-images.yaml‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎cmd/runtimes/deepspeed/Dockerfile‎
Lines changed: 6 additions & 13 deletions b/‎cmd/runtimes/deepspeed/Dockerfile‎
Lines changed: 6 additions & 13 deletions
diff --git a/‎cmd/runtimes/mlx/Dockerfile‎
Lines changed: 14 additions & 11 deletions b/‎cmd/runtimes/mlx/Dockerfile‎
Lines changed: 14 additions & 11 deletions
diff --git a/‎cmd/runtimes/mlx/requirements.txt‎
Lines changed: 6 additions & 0 deletions b/‎cmd/runtimes/mlx/requirements.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎examples/deepspeed/text-summarization/T5-Fine-Tuning.ipynb‎
Lines changed: 52 additions & 5 deletions b/‎examples/deepspeed/text-summarization/T5-Fine-Tuning.ipynb‎
Lines changed: 52 additions & 5 deletions
@@ -4,9 +4,9 @@ on:
   push:
     branches:
       - master
-      - 'release-*'
+      - "release-*"
     tags:
-      - 'v*'
+      - "v*"
   pull_request:
 
 jobs:
@@ -34,9 +34,10 @@ jobs:
           - component-name: deepspeed-runtime
             dockerfile: cmd/runtimes/deepspeed/Dockerfile
             platforms: linux/amd64,linux/arm64
+          # TODO (andreyvelich): Re-enable linux/arm64 once mlx[cuda] supports ARM: https://github.com/ml-explore/mlx/issues/2469
           - component-name: mlx-runtime
             dockerfile: cmd/runtimes/mlx/Dockerfile
-            platforms: linux/arm64
+            platforms: linux/amd64
           - component-name: torchtune-trainer
             dockerfile: cmd/trainers/torchtune/Dockerfile
             platforms: linux/amd64,linux/arm64
 
@@ -1,15 +1,11 @@
 FROM mpioperator/base:v0.6.0 AS mpi
 FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
 
-# Disable interactive dialog from apt.
-ENV DEBIAN_FRONTEND noninteractive
-
-# Install libraries required for OpenMPI to work.
-RUN apt-get update && apt install -y --no-install-recommends \
-    cmake g++ gcc \
-    wget vim \
-    openssh-client openssh-server libcap2-bin \
-    libopenmpi-dev openmpi-bin
+# Install libraries required for OpenMPI to work. The image reportedly installs OpenMPI 5.0.7 (TODO: verify on the Ubuntu 22.04 base).
+RUN apt update && apt install -y --no-install-recommends \
+    openssh-server openssh-client libcap2-bin \
+    g++ libopenmpi-dev \
+    python3-dev pip && rm -f /usr/bin/python && ln -s /usr/bin/python3 /usr/bin/python && rm -rf /var/lib/apt/lists/*
 
 # Add capability to run sshd as non-root.
 RUN setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd
@@ -24,10 +20,7 @@ COPY --from=mpi /etc/ssh/ssh_config /etc/ssh/ssh_config
 COPY --from=mpi /etc/ssh/sshd_config /etc/ssh/sshd_config
 COPY --from=mpi /home/mpiuser/.sshd_config /home/mpiuser/.sshd_config
 
-# Install the required Python packages.
-RUN apt install -y python3-dev pip && rm -f /usr/bin/python && ln -s /usr/bin/python3 /usr/bin/python
-
-ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+# Set home directory for mpiuser.
 ENV HOME=/home/mpiuser
 ENV PATH=$HOME/.local/bin:$PATH
 
 
@@ -1,14 +1,15 @@
 FROM mpioperator/base:v0.6.0 AS mpi
-FROM debian:trixie
+FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
 
-# Install libraries required for OpenMPI and MLX. This image installs OpenMPI 5.0.7
+# Install libraries required for OpenMPI and MLX. TODO: verify the OpenMPI version; 5.0.7 was recorded for the previous debian:trixie base.
 RUN apt update && apt install -y --no-install-recommends \
     openssh-server openssh-client libcap2-bin \
-    libopenmpi-dev \
-    git g++ libblas-dev liblapack-dev liblapacke-dev
+    g++ libopenmpi-dev libblas-dev liblapack-dev liblapacke-dev \
+    python3-dev pip && rm -f /usr/bin/python && ln -s /usr/bin/python3 /usr/bin/python && rm -rf /var/lib/apt/lists/*
 
 # Add capability to run sshd as non-root.
 RUN setcap CAP_NET_BIND_SERVICE=+eip /usr/sbin/sshd
+RUN apt remove libcap2-bin -y
 
 # Configure mpiuser and home directory.
 RUN useradd -m mpiuser
@@ -19,12 +20,14 @@ COPY --from=mpi /etc/ssh/ssh_config /etc/ssh/ssh_config
 COPY --from=mpi /etc/ssh/sshd_config /etc/ssh/sshd_config
 COPY --from=mpi /home/mpiuser/.sshd_config /home/mpiuser/.sshd_config
 
-# Install the required Python packages. This image has Python 3.13
-RUN apt update && apt install -y python3 python3-pip && ln -s /usr/bin/python3 /usr/bin/python && apt clean
+# Set home directory for mpiuser.
+ENV HOME=/home/mpiuser
+ENV PATH=$HOME/.local/bin:$PATH
 
-# We have to build MLX and MLX Data from source.
-RUN git clone https://github.com/ml-explore/mlx.git
-RUN cd mlx && git checkout f018e248cd75dbb65668f418d6afb67842ea28b7 && CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -v --break-system-packages .
+COPY cmd/runtimes/mlx/requirements.txt .
+RUN pip install --user -r requirements.txt
 
-RUN git clone https://github.com/ml-explore/mlx-data.git
-RUN cd mlx-data && git checkout 79516daa75aa3e9fd72fc5e3fb5e9e629912feac && CMAKE_BUILD_PARALLEL_LEVEL=8 pip install -v --break-system-packages .
+# Make mpiuser the owner of ~/.local so that it can install additional Python packages,
+# and of ~/.cache, which ML frameworks use to download models (e.g. HF models).
+RUN chown -R mpiuser:mpiuser /home/mpiuser/.local
+RUN mkdir -p /home/mpiuser/.cache && chown -R mpiuser:mpiuser /home/mpiuser/.cache
@@ -0,0 +1,6 @@
+# MLX libraries.
+mlx[cuda]==0.28.0
+mlx-data==0.1.0
+mlx-lm==0.26.3
+# HuggingFace libraries.
+datasets==4.0.0
@@ -14,9 +14,7 @@
     "\n",
     "Wikihow dataset: https://huggingface.co/datasets/sentence-transformers/wikihow\n",
     "\n",
-    "This Notebook will use **4 x A100 NVIDIA GPUs**, to fine-tune T5 model on 2 nodes (every node has 2 GPUs).\n",
-    "\n",
-    "**TODO (andreyvelich)**: Currently, to run this Notebook you have to manualy update the container resources in the ClusterTrainingRuntime, since we don't propogate TrainJob's `resources_per_node` to the JobSet"
+    "This Notebook uses **4 x NVIDIA A100 GPUs** to fine-tune the T5 model on 2 nodes (2 GPUs per node)."
    ]
   },
   {
@@ -35,7 +33,56 @@
    "id": "4900404c5d532bdf",
    "metadata": {},
    "outputs": [],
-   "source": "# !pip install git+https://github.com/kubeflow/sdk.git@main"
+   "source": [
+    "# !pip install git+https://github.com/kubeflow/sdk.git@main"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "74f50fe7-5a01-468c-9efe-2913b3d251da",
+   "metadata": {},
+   "source": [
+    "## Update the GPU Resources\n",
+    "\n",
+    "Currently, Kubeflow Trainer does not support configuring DeepSpeed resources directly through a\n",
+    "TrainJob specification.\n",
+    "\n",
+    "To adjust GPU allocations (and other container resource settings), you must manually patch the ClusterTrainingRuntime.\n",
+    "\n",
+    "Progress for native resource configuration in TrainJob is being tracked here: [kubeflow/trainer#2650](https://github.com/kubeflow/trainer/issues/2650)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d038d8cd-4e5a-4c4c-aa17-a5c575e2948a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "clustertrainingruntime.trainer.kubeflow.org/deepspeed-distributed patched\n"
+     ]
+    }
+   ],
+   "source": [
+    "patch = \"\"\"\n",
+    "[\n",
+    "  {\n",
+    "    \"op\": \"add\",\n",
+    "    \"path\": \"/spec/template/spec/replicatedJobs/0/template/spec/template/spec/containers/0/resources\",\n",
+    "    \"value\": { \"limits\": { \"nvidia.com/gpu\": \"2\" } }\n",
+    "  },\n",
+    "  {\n",
+    "    \"op\": \"add\",\n",
+    "    \"path\": \"/spec/template/spec/replicatedJobs/1/template/spec/template/spec/containers/0/resources\",\n",
+    "    \"value\": { \"limits\": { \"nvidia.com/gpu\": \"2\" } }\n",
+    "  }\n",
+    "]\n",
+    "\"\"\"\n",
+    "!kubectl patch clustertrainingruntime deepspeed-distributed --type='json' -p \"$patch\""
+   ]
   },
   {
    "cell_type": "markdown",
@@ -293,7 +340,7 @@
    "outputs": [],
    "source": [
     "MODEL_NAME = \"t5-base\"\n",
-    "BUCKET_NAME = \"TODO: add your bucket here\""
+    "# BUCKET_NAME = \"TODO: add your bucket here\""
    ]
   },
   {