feat: support for managing gpu enabled self runner infra (kubeflow#2762)

jaiakash · tdn21 · commit c880b0c8ad24 · 2025-09-06T14:12:28.000+05:30
* feat: support for creating and managing gpu cluster Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: makefile bug Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * add: ci action to ask maintainers to add label to when changes are detected Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * chore: fixed issues and cleanup Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: run check on change in pr Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * feat: added seperate workflow for gpu runner Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: deepspeed typo Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * hotfix: add gpu label on PR without merging Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * chore: merged into single action Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fixL run runner as soon as label is added Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: use gpu runner when label exist Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: revert changes and fix script permission Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: create gpu supported gpu Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: nvidia issue Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: gpu cluster and torchtune model Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: notebookpath and delete cluster Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * tmp fix: notebook to use k8s client Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: use akash sdk and fix notenook size Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: notebook error Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: delete cluster before creating one and notebook Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: kube config Signed-off-by: Akash Jaiswal 
<akashjaiswal3846@gmail.com> * fix: makefile add comment Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: nvidia runtime Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * hotfix: disable e2e go Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: delete cluster Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: delete cluster Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * hotfix: temporarly use my personal token Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * chore: refactored code Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * hotfix: take hf token from env of self runner vm Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: to run notebook directly Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * refactor: torchtune job Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: ci action Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * fix: pre commit hook Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * chore: rename ci action Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * rem: delete cluster command from makefile Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * chore: rem some steps, fixed wait timing and notebook logs according to kubeflow/sdk#83 Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> * update: upgrade k8s to 1.34.0 Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> --------- Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com> Signed-off-by: Tarun Duhan <itarunduhan@gmail.com>
diff --git a/.github/workflows/test-e2e-gpu.yaml b/.github/workflows/test-e2e-gpu.yaml
@@ -0,0 +1,87 @@
+# GPU end-to-end tests, triggered by pull request activity.
+name: GPU E2E Test
+
+on:
+  pull_request:
+    # "labeled" is listed so that adding a label to the PR starts a run.
+    types:
+      - opened
+      - reopened
+      - synchronize
+      - labeled
+
+jobs:
+  gpu-e2e-test:
+    name: GPU E2E Test
+    # NOTE(review): presumably the label of a self-hosted GPU runner — confirm.
+    runs-on: oracle-vm-16cpu-a10gpu-240gb
+
+    strategy:
+      fail-fast: false
+      matrix:
+        kubernetes-version:
+          - "1.34.0"
+
+    # The repository is checked out below GOPATH, and every `run` step
+    # defaults to that checkout directory.
+    env:
+      GOPATH: ${{ github.workspace }}/go
+    defaults:
+      run:
+        working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
+
+    steps:
+      # Gate the remaining steps on the PR carrying the exact
+      # "ok-to-test-gpu-runner" label. The result is exposed as the `skip`
+      # step output ("true" or "false").
+      - name: Check GPU label
+        id: check-label
+        # This step runs before checkout, so the job's default working
+        # directory (inside the checkout path) may not exist yet.
+        working-directory: ${{ github.workspace }}
+        env:
+          # Evaluated by GitHub and passed through the environment, so label
+          # names are never interpolated into the shell script. `contains`
+          # matches whole array elements, not substrings.
+          HAS_GPU_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test-gpu-runner') }}
+        run: |
+          if [[ "${HAS_GPU_LABEL}" == "true" ]]; then
+            echo "Label found. Running GPU tests."
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "✅ Skipping GPU E2E tests (label not present)."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Toolchain setup. Every step is skipped unless the label check above
+      # reported skip == 'false'.
+      - name: Check out code
+        if: steps.check-label.outputs.skip == 'false'
+        uses: actions/checkout@v4
+        with:
+          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
+
+      - name: Setup Go
+        if: steps.check-label.outputs.skip == 'false'
+        uses: actions/setup-go@v5
+        with:
+          # The Go version is taken from the checked-out go.mod.
+          go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/go.mod
+
+      - name: Setup Python
+        if: steps.check-label.outputs.skip == 'false'
+        uses: actions/setup-python@v5
+        with:
+          # Quoted so YAML keeps the version a string; an unquoted value such
+          # as 3.10 would be loaded as the float 3.1.
+          python-version: "3.11"
+
+      - name: Install dependencies
+        if: steps.check-label.outputs.skip == 'false'
+        run: |
+          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
+          pip install git+https://github.com/kubeflow/sdk.git@main
+
+      # Create the GPU-enabled cluster for the Kubernetes version under test.
+      - name: Setup cluster with GPU support using nvidia/kind
+        if: steps.check-label.outputs.skip == 'false'
+        run: make test-e2e-setup-gpu-cluster K8S_VERSION=${{ matrix.kubernetes-version }}
+
+      # Execute the TorchTune example notebook; the executed copy is written
+      # under artifacts/notebooks.
+      - name: Run e2e test on GPU cluster
+        if: steps.check-label.outputs.skip == 'false'
+        run: |
+          mkdir -p artifacts/notebooks
+          make test-e2e-notebook \
+            NOTEBOOK_INPUT=./examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb \
+            NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_alpaca-trainjob-yaml.ipynb \
+            TIMEOUT=900
+
+      # Upload the test output even when an earlier step failed, but not when
+      # the GPU tests were skipped: in that case this run produced nothing, and
+      # any files found in the directory would be leftovers from an earlier run.
+      - name: Upload Artifacts to GitHub
+        if: always() && steps.check-label.outputs.skip == 'false'
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.kubernetes-version }}
+          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/artifacts/*
+          retention-days: 1
+
+  # Cleanup job: runs after gpu-e2e-test whatever its outcome.
+  delete-kind-cluster:
+    name: Delete kind Cluster
+    needs: [gpu-e2e-test]
+    if: always()
+    runs-on: oracle-vm-16cpu-a10gpu-240gb
+    steps:
+      - name: Delete any existing kind cluster
+        run: |
+          if sudo kind delete cluster --name kind-gpu; then
+            echo "kind cluster has been deleted"
+          else
+            echo "kind cluster doesn't exist"
+          fi
diff --git a/Makefile b/Makefile
@@ -178,6 +178,10 @@ test-python-integration: ## Run Python integration test.
 test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test.
 	KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-cluster.sh
 
+# Runs hack/e2e-setup-gpu-cluster.sh with KIND and K8S_VERSION passed through
+# the environment; the `kind` prerequisite is made first.
+.PHONY: test-e2e-setup-gpu-cluster
+test-e2e-setup-gpu-cluster: kind ## Setup Kind cluster for GPU e2e test.
+	KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-gpu-cluster.sh
+
 .PHONY: test-e2e
 test-e2e: ginkgo ## Run Go e2e test.
 	$(GINKGO) -v ./test/e2e/...
diff --git a/examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb b/examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb
@@ -38,7 +38,9 @@
    "id": "288ec515",
    "metadata": {},
    "outputs": [],
-   "source": "!pip install git+https://github.com/kubeflow/sdk.git@main"
+   "source": [
+    "!pip install git+https://github.com/kubeflow/sdk.git@main"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -73,6 +75,8 @@
    "source": [
     "# List all available Kubeflow Training Runtimes.\n",
     "from kubeflow.trainer import *\n",
+    "from kubeflow_trainer_api import models\n",
+    "import os\n",
     "\n",
     "client = TrainerClient()\n",
     "for runtime in client.list_runtimes():\n",
@@ -154,19 +158,23 @@
    ],
    "source": [
     "# Create a PersistentVolumeClaim for the TorchTune Llama 3.2 1B model.\n",
-    "client.core_api.create_namespaced_persistent_volume_claim(\n",
-    "  namespace=\"default\",\n",
-    "  body=client.V1PersistentVolumeClaim(\n",
-    "    api_version=\"v1\",\n",
-    "    kind=\"PersistentVolumeClaim\",\n",
-    "    metadata=client.V1ObjectMeta(name=\"torchtune-llama3.2-1b\"),\n",
-    "    spec=client.V1PersistentVolumeClaimSpec(\n",
-    "      access_modes=[\"ReadWriteOnce\"],\n",
-    "      resources=client.V1ResourceRequirements(\n",
-    "        requests={\"storage\": \"20Gi\"}\n",
-    "      ),\n",
-    "    ),\n",
-    "  ),\n",
+    "client.backend.core_api.create_namespaced_persistent_volume_claim(\n",
+    "    namespace=\"default\",\n",
+    "    body=models.IoK8sApiCoreV1PersistentVolumeClaim(\n",
+    "        apiVersion=\"v1\",\n",
+    "        kind=\"PersistentVolumeClaim\",\n",
+    "        metadata=models.IoK8sApimachineryPkgApisMetaV1ObjectMeta(\n",
+    "            name=\"torchtune-llama3.2-1b\"\n",
+    "        ),\n",
+    "        spec=models.IoK8sApiCoreV1PersistentVolumeClaimSpec(\n",
+    "            accessModes=[\"ReadWriteOnce\"],\n",
+    "            resources=models.IoK8sApiCoreV1VolumeResourceRequirements(\n",
+    "                requests={\n",
+    "                    \"storage\": models.IoK8sApimachineryPkgApiResourceQuantity(\"200Gi\")\n",
+    "                }\n",
+    "            ),\n",
+    "        ),\n",
+    "    ).to_dict(),\n",
     ")"
    ]
   },
@@ -188,31 +196,51 @@
    "outputs": [],
    "source": [
     "job_name = client.train(\n",
-    "    runtime=Runtime(\n",
-    "        name=\"torchtune-llama3.2-1b\"\n",
-    "    ),\n",
+    "    runtime=client.get_runtime(name=\"torchtune-llama3.2-1b\"),\n",
     "    initializer=Initializer(\n",
     "        dataset=HuggingFaceDatasetInitializer(\n",
     "            storage_uri=\"hf://tatsu-lab/alpaca/data\"\n",
     "        ),\n",
     "        model=HuggingFaceModelInitializer(\n",
     "            storage_uri=\"hf://meta-llama/Llama-3.2-1B-Instruct\",\n",
-    "            access_token=\"<YOUR_HF_TOKEN>\"  # Replace with your Hugging Face token,\n",
+    "            access_token=os.environ[\"HF_TOKEN\"]  # Hugging Face token, read from the HF_TOKEN environment variable.\n",
     "        )\n",
     "    ),\n",
     "    trainer=BuiltinTrainer(\n",
     "        config=TorchTuneConfig(\n",
     "            dataset_preprocess_config=TorchTuneInstructDataset(\n",
-    "                source=DataFormat.PARQUET,\n",
+    "                source=DataFormat.PARQUET, split=\"train[:1000]\"\n",
     "            ),\n",
     "            resources_per_node={\n",
+    "                \"memory\": \"200G\",\n",
     "                \"gpu\": 1,\n",
-    "            }\n",
+    "            },\n",
+    "            \n",
     "        )\n",
     "    )\n",
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "ee5fbe8e",
+   "metadata": {},
+   "source": [
+    "## Wait for running status"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53eaa65a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Wait for the running status.\n",
+    "client.wait_for_job_status(name=job_name, status={\"Running\"})\n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "75a82b76",
@@ -247,8 +275,8 @@
    "source": [
     "from kubeflow.trainer.constants import constants\n",
     "\n",
-    "log_dict = client.get_job_logs(job_name, follow=False, step=constants.DATASET_INITIALIZER)\n",
-    "print(log_dict[constants.DATASET_INITIALIZER])"
+    "for line in client.get_job_logs(job_name, follow=True, step=constants.DATASET_INITIALIZER):\n",
+    "    print(line)"
    ]
   },
   {
@@ -279,16 +307,16 @@
     }
    ],
    "source": [
-    "log_dict = client.get_job_logs(job_name, follow=False, step=constants.MODEL_INITIALIZER)\n",
-    "print(log_dict[constants.MODEL_INITIALIZER])"
+    "for line in client.get_job_logs(job_name, follow=True, step=constants.MODEL_INITIALIZER):\n",
+    "    print(line)"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "b67775ea",
    "metadata": {},
    "source": [
-    "### Trainer Node"
+    "### Trainer Node "
    ]
   },
   {
@@ -392,8 +420,11 @@
     }
    ],
    "source": [
-    "log_dict = client.get_job_logs(job_name, follow=False)\n",
-    "print(log_dict[f\"{constants.NODE}-0\"])"
+    "for c in client.get_job(name=job_name).steps:\n",
+    "    print(f\"Step: {c.name}, Status: {c.status}, Devices: {c.device} x {c.device_count}\\n\")\n",
+    "\n",
+    "for line in client.get_job_logs(job_name, follow=True):\n",
+    "    print(line)"
    ]
   },
   {
diff --git a/hack/e2e-setup-gpu-cluster.sh b/hack/e2e-setup-gpu-cluster.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script sets up a GPU-enabled Kind cluster for Kubeflow Trainer e2e tests.
+
+# Abort on command failure, on use of an unset variable, and on a failure in
+# any stage of a pipeline; trace every command as it runs.
+set -o errexit
+set -o nounset
+set -o pipefail
+set -x
+
+# Configure variables.
+# KIND and K8S_VERSION may be overridden from the environment; the defaults
+# below apply otherwise.
+KIND=${KIND:-./bin/kind}
+K8S_VERSION=${K8S_VERSION:-1.32.0}
+# Chart version passed to `helm install` for the NVIDIA GPU operator.
+GPU_OPERATOR_VERSION="v25.3.2"
+# Node image used when creating the cluster.
+KIND_NODE_VERSION=kindest/node:v${K8S_VERSION}
+GPU_CLUSTER_NAME="kind-gpu"
+# Namespace whose deployment and pods are waited on and inspected below.
+NAMESPACE="kubeflow-system"
+# Timeout handed to the `kubectl wait` calls for the control plane.
+TIMEOUT="5m"
+
+# Kubeflow Trainer images.
+# TODO (andreyvelich): Support initializers images.
+CONTROLLER_MANAGER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/trainer-controller-manager"
+CONTROLLER_MANAGER_CI_IMAGE_TAG="test"
+CONTROLLER_MANAGER_CI_IMAGE="${CONTROLLER_MANAGER_CI_IMAGE_NAME}:${CONTROLLER_MANAGER_CI_IMAGE_TAG}"
+# Build the controller manager image, using the current directory as the
+# build context.
+echo "Build Kubeflow Trainer images"
+sudo docker build . -f cmd/trainer-controller-manager/Dockerfile -t ${CONTROLLER_MANAGER_CI_IMAGE}
+
+# Set up Docker to use NVIDIA runtime.
+sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
+sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
+sudo systemctl restart docker
+
+# Create a Kind cluster with GPU support.
+nvkind cluster create --name ${GPU_CLUSTER_NAME} --image "${KIND_NODE_VERSION}"
+nvkind cluster print-gpus
+
+# Install gpu-operator to make sure we can run GPU workloads.
+echo "Install NVIDIA GPU Operator"
+# The namespace is created up front so it can be labelled with the privileged
+# pod-security level before the chart is installed into it.
+kubectl create ns gpu-operator
+kubectl label --overwrite ns gpu-operator pod-security.kubernetes.io/enforce=privileged
+
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
+
+helm install --wait --generate-name \
+  -n gpu-operator --create-namespace \
+  nvidia/gpu-operator \
+  --version="${GPU_OPERATOR_VERSION}"
+
+# Validation steps for GPU operator installation
+kubectl get ns gpu-operator
+kubectl get ns gpu-operator --show-labels | grep pod-security.kubernetes.io/enforce=privileged
+helm list -n gpu-operator
+# Best-effort wait: a pod that does not become Ready within the timeout is
+# reported but does not abort the script. `read -r` keeps backslashes literal.
+kubectl get pods -n gpu-operator -o name | while read -r pod; do
+  kubectl wait --for=condition=Ready --timeout=300s "$pod" -n gpu-operator || echo "$pod failed to become Ready"
+done
+kubectl get pods -n gpu-operator
+# The column spec is single-quoted so the backslash reaches kubectl and escapes
+# the dot in the "nvidia.com/gpu" resource name; unquoted, the shell strips the
+# backslash and the path is split at that dot.
+kubectl get nodes -o=custom-columns='NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'
+
+# Load Kubeflow Trainer images
+echo "Load Kubeflow Trainer images"
+kind load docker-image "${CONTROLLER_MANAGER_CI_IMAGE}" --name "${GPU_CLUSTER_NAME}"
+
+# Deploy Kubeflow Trainer control plane
+echo "Deploy Kubeflow Trainer control plane"
+E2E_MANIFESTS_DIR="artifacts/e2e/manifests"
+mkdir -p "${E2E_MANIFESTS_DIR}"
+cat <<EOF >"${E2E_MANIFESTS_DIR}/kustomization.yaml"
+  apiVersion: kustomize.config.k8s.io/v1beta1
+  kind: Kustomization
+  resources:
+  - ../../../manifests/overlays/manager
+  images:
+  - name: "${CONTROLLER_MANAGER_CI_IMAGE_NAME}"
+    newTag: "${CONTROLLER_MANAGER_CI_IMAGE_TAG}"
+EOF
+
+kubectl apply --server-side -k "${E2E_MANIFESTS_DIR}"
+
+# We should wait until Deployment is in Ready status.
+echo "Wait for Kubeflow Trainer to be ready"
+(kubectl wait deploy/kubeflow-trainer-controller-manager --for=condition=available -n ${NAMESPACE} --timeout ${TIMEOUT} &&
+  kubectl wait pods --for=condition=ready -n ${NAMESPACE} --timeout ${TIMEOUT} --all) ||
+  (
+    echo "Failed to wait until Kubeflow Trainer is ready" &&
+      kubectl get pods -n ${NAMESPACE} &&
+      kubectl describe pods -n ${NAMESPACE} &&
+      exit 1
+  )
+
+# Print client/server versions, cluster endpoints, the node list, and the
+# state of the pods in ${NAMESPACE}.
+print_cluster_info() {
+  local ns="${NAMESPACE}"
+
+  kubectl version
+  kubectl cluster-info
+  kubectl get nodes
+  kubectl get pods -n "${ns}"
+  kubectl describe pod -n "${ns}"
+}
+
+# TODO (andreyvelich): Currently, we print manager logs due to flaky test.
+# If applying the runtimes fails, print the manager logs and cluster state,
+# then exit with status 1.
+echo "Deploy Kubeflow Trainer runtimes"
+kubectl apply --server-side -k manifests/overlays/runtimes || (
+  kubectl logs -n "${NAMESPACE}" -l app.kubernetes.io/name=trainer &&
+    print_cluster_info &&
+    exit 1
+)
+
+# TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster.
+TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime
+docker pull "${TORCH_RUNTIME_IMAGE}"
+# Load through the Kind binary selected by the KIND variable (set at the top
+# of this script and passed in by the Makefile target) rather than whichever
+# `kind` is first on PATH.
+"${KIND}" load docker-image "${TORCH_RUNTIME_IMAGE}" --name "${GPU_CLUSTER_NAME}"
+
+print_cluster_info