kubeflow · jaiakash · Aug 2, 2025 · Aug 13, 2025 · Aug 13, 2025 · Aug 13, 2025
diff --git a/.github/workflows/test-e2e-gpu.yaml b/.github/workflows/test-e2e-gpu.yaml
@@ -0,0 +1,97 @@
+name: GPU E2E Test
+
+on:
+  pull_request:
+    types: [opened, reopened, synchronize, labeled]
+
+jobs:
+  gpu-e2e-test:
+    name: GPU E2E Test
+    runs-on: self-runner
+
+    env:
+      GOPATH: ${{ github.workspace }}/go
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+    defaults:
+      run:
+        working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
+
+    strategy:
+      fail-fast: false
+      matrix:
+        kubernetes-version: ["1.33.1"]
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+        with:
+          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
+
+      # Gate every GPU step on the 'ok-to-test-gpu-runner' PR label.
+      # contains() on the label-name array matches whole label names only, and
+      # only its true/false result is interpolated into the script, so label
+      # text never reaches the shell.
+      - name: Check GPU label
+        id: check-label
+        run: |
+          if [[ "${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test-gpu-runner') }}" != "true" ]]; then
+            echo "Label 'ok-to-test-gpu-runner' not found. Skipping GPU tests."
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "Label found. Running GPU tests."
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Skip message
+        if: steps.check-label.outputs.skip == 'true'
+        run: echo "✅ Skipped GPU E2E tests (label not present)."
+
+      - name: Setup Go
+        if: steps.check-label.outputs.skip == 'false'
+        uses: actions/setup-go@v5
+        with:
+          go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/go.mod
+
+      - name: Setup Python
+        if: steps.check-label.outputs.skip == 'false'
+        uses: actions/setup-python@v5
+        with:
+          # Quoted so YAML keeps the version as a string rather than a float.
+          python-version: "3.11"
+
+      - name: Install dependencies
+        if: steps.check-label.outputs.skip == 'false'
+        run: |
+          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
+          pip install git+https://github.com/kubeflow/sdk.git@main
+
+      - name: Delete any existing cluster
+        if: steps.check-label.outputs.skip == 'false'
+        run: |
+          make test-e2e-delete-gpu-cluster
+
+      - name: Setup cluster with GPU support using nvidia/kind
+        if: steps.check-label.outputs.skip == 'false'
+        run: |
+          make test-e2e-setup-gpu-cluster K8S_VERSION=${{ matrix.kubernetes-version }}
+
+      # On failure, print the trainer controller logs before failing the step.
+      - name: Run e2e with Go
+        if: steps.check-label.outputs.skip == 'false'
+        run: |
+          make test-e2e || (kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer && exit 1)
+
+      - name: Run e2e test for torchtrainer notebook on GPU cluster
+        if: steps.check-label.outputs.skip == 'false'
+        run: |
+          mkdir -p artifacts/notebooks
+          make test-e2e-notebook NOTEBOOK_INPUT=./examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_alpaca-trainjob-yaml.ipynb TIMEOUT=900
+
+      # always(): the upload is attempted even when an earlier step failed.
+      - name: Upload Artifacts to GitHub
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.kubernetes-version }}
+          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/artifacts/*
+          retention-days: 1
diff --git a/Makefile b/Makefile
@@ -178,6 +178,14 @@ test-python-integration: ## Run Python integration test.
 test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test.
 	KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-cluster.sh
 
+.PHONY: test-e2e-setup-gpu-cluster
+test-e2e-setup-gpu-cluster: kind ## Setup Kind cluster for GPU e2e test.
+	KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-gpu-cluster.sh
+
+.PHONY: test-e2e-delete-gpu-cluster
+test-e2e-delete-gpu-cluster: kind ## Delete Kind clusters created for GPU e2e test.
+	KIND=$(KIND) ./hack/e2e-delete-gpu-cluster.sh
+
 .PHONY: test-e2e
 test-e2e: ginkgo ## Run Go e2e test.
 	$(GINKGO) -v ./test/e2e/...

diff --git a/examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb b/examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb
@@ -38,7 +38,9 @@
    "id": "288ec515",
    "metadata": {},
    "outputs": [],
-   "source": "!pip install git+https://github.com/kubeflow/sdk.git@main"
+   "source": [
+    "!pip install git+https://github.com/kubeflow/sdk.git@main"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -73,6 +75,8 @@
    "source": [
     "# List all available Kubeflow Training Runtimes.\n",
     "from kubeflow.trainer import *\n",
+    "from kubernetes import client as k8s_client\n",
+    "import os\n",
     "\n",
     "client = TrainerClient()\n",
     "for runtime in client.list_runtimes():\n",
@@ -154,16 +158,16 @@
    ],
    "source": [
     "# Create a PersistentVolumeClaim for the TorchTune Llama 3.2 1B model.\n",
-    "client.core_api.create_namespaced_persistent_volume_claim(\n",
+    "client.backend.core_api.create_namespaced_persistent_volume_claim(\n",
     "  namespace=\"default\",\n",
-    "  body=client.V1PersistentVolumeClaim(\n",
+    "  body=k8s_client.V1PersistentVolumeClaim(\n",
     "    api_version=\"v1\",\n",
     "    kind=\"PersistentVolumeClaim\",\n",
-    "    metadata=client.V1ObjectMeta(name=\"torchtune-llama3.2-1b\"),\n",
-    "    spec=client.V1PersistentVolumeClaimSpec(\n",
+    "    metadata=k8s_client.V1ObjectMeta(name=\"torchtune-llama3.2-1b\"),\n",
+    "    spec=k8s_client.V1PersistentVolumeClaimSpec(\n",
     "      access_modes=[\"ReadWriteOnce\"],\n",
-    "      resources=client.V1ResourceRequirements(\n",
-    "        requests={\"storage\": \"20Gi\"}\n",
+    "      resources=k8s_client.V1ResourceRequirements(\n",
+    "        requests={\"storage\": \"200Gi\"}\n",
     "      ),\n",
     "    ),\n",
     "  ),\n",
@@ -188,26 +192,26 @@
    "outputs": [],
    "source": [
     "job_name = client.train(\n",
-    "    runtime=Runtime(\n",
-    "        name=\"torchtune-llama3.2-1b\"\n",
-    "    ),\n",
+    "    runtime=client.get_runtime(name=\"torchtune-llama3.2-1b\"),\n",
     "    initializer=Initializer(\n",
     "        dataset=HuggingFaceDatasetInitializer(\n",
     "            storage_uri=\"hf://tatsu-lab/alpaca/data\"\n",
     "        ),\n",
     "        model=HuggingFaceModelInitializer(\n",
     "            storage_uri=\"hf://meta-llama/Llama-3.2-1B-Instruct\",\n",
-    "            access_token=\"<YOUR_HF_TOKEN>\"  # Replace with your Hugging Face token,\n",
+    "            access_token=os.environ[\"HF_TOKEN\"]  # Read from the HF_TOKEN environment variable; set it to your Hugging Face token.\n",
     "        )\n",
     "    ),\n",
     "    trainer=BuiltinTrainer(\n",
     "        config=TorchTuneConfig(\n",
     "            dataset_preprocess_config=TorchTuneInstructDataset(\n",
-    "                source=DataFormat.PARQUET,\n",
+    "                source=DataFormat.PARQUET, split=\"train[:1000]\"\n",
     "            ),\n",
     "            resources_per_node={\n",
+    "                \"memory\": \"200G\",\n",
     "                \"gpu\": 1,\n",
-    "            }\n",
+    "            },\n",
+    "            \n",
     "        )\n",
     "    )\n",
     ")"
@@ -248,7 +252,7 @@
     "from kubeflow.trainer.constants import constants\n",
     "\n",
     "log_dict = client.get_job_logs(job_name, follow=False, step=constants.DATASET_INITIALIZER)\n",
-    "print(log_dict[constants.DATASET_INITIALIZER])"
+    "print(log_dict.get(constants.DATASET_INITIALIZER, \"No logs found for dataset initializer.\"))"
    ]
   },
   {
@@ -280,7 +284,7 @@
    ],
    "source": [
     "log_dict = client.get_job_logs(job_name, follow=False, step=constants.MODEL_INITIALIZER)\n",
-    "print(log_dict[constants.MODEL_INITIALIZER])"
+    "print(log_dict.get(constants.MODEL_INITIALIZER, \"No logs found for model initializer.\"))"
    ]
   },
   {
@@ -393,7 +397,7 @@
    ],
    "source": [
     "log_dict = client.get_job_logs(job_name, follow=False)\n",
-    "print(log_dict[f\"{constants.NODE}-0\"])"
+    "print(log_dict.get(f\"{constants.NODE}-0\", \"No logs found for trainer.\"))"
    ]
   },
   {

diff --git a/hack/e2e-delete-gpu-cluster.sh b/hack/e2e-delete-gpu-cluster.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This shell is used to delete the nvkind Kind clusters created for Kubeflow Trainer GPU e2e tests.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+set -x
+
+# Use the Kind binary handed in through the KIND environment variable and
+# fall back to `kind` on PATH when it is unset.
+KIND=${KIND:-kind}
+
+# Find all clusters whose name starts with "nvkind".
+# `|| true` keeps errexit/pipefail from aborting when grep matches nothing.
+CLUSTERS=$("${KIND}" get clusters | grep '^nvkind' || true)
+
+if [[ -z "${CLUSTERS}" ]]; then
+  echo "No nvkind clusters found. Nothing to delete."
+  exit 0
+fi
+
+for CLUSTER_NAME in ${CLUSTERS}; do
+  echo "Deleting Kind cluster: ${CLUSTER_NAME}"
+  # Best effort: a failed delete is reported but does not stop the loop.
+  if "${KIND}" delete cluster --name "${CLUSTER_NAME}"; then
+    echo "Successfully deleted ${CLUSTER_NAME}"
+  else
+    echo "Warning: Failed to delete ${CLUSTER_NAME}, continuing..."
+  fi
+done
diff --git a/hack/e2e-run-notebook.sh b/hack/e2e-run-notebook.sh
@@ -42,7 +42,7 @@ print_results() {
     kubectl describe trainjob
     kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer
     kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1
-    kubectl wait trainjob --for=condition=Complete --all --timeout 3s
+    kubectl wait trainjob --for=condition=Complete --all --timeout 600s
 }
 
 (papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) ||

diff --git a/hack/e2e-setup-gpu-cluster.sh b/hack/e2e-setup-gpu-cluster.sh
@@ -0,0 +1,132 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This shell is used to setup a GPU-enabled Kind cluster (via nvkind) for Kubeflow Trainer e2e tests.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+set -x
+
+# Configure variables.
+KIND=${KIND:-./bin/kind}
+K8S_VERSION=${K8S_VERSION:-1.32.0}
+GPU_OPERATOR_VERSION="v25.3.2"
+KIND_NODE_VERSION=kindest/node:v${K8S_VERSION}
+NAMESPACE="kubeflow-system"
+TIMEOUT="5m"
+
+# Kubeflow Trainer images.
+# TODO (andreyvelich): Support initializers images.
+CONTROLLER_MANAGER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/trainer-controller-manager"
+CONTROLLER_MANAGER_CI_IMAGE_TAG="test"
+CONTROLLER_MANAGER_CI_IMAGE="${CONTROLLER_MANAGER_CI_IMAGE_NAME}:${CONTROLLER_MANAGER_CI_IMAGE_TAG}"
+echo "Build Kubeflow Trainer images"
+sudo docker build . -f cmd/trainer-controller-manager/Dockerfile -t "${CONTROLLER_MANAGER_CI_IMAGE}"
+
+# Set up Docker to use NVIDIA runtime.
+sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
+sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
+sudo systemctl restart docker
+
+# Create a Kind cluster with GPU support.
+nvkind cluster create --image "${KIND_NODE_VERSION}"
+# No cluster name is passed to nvkind above, so look it up through Kind.
+# NOTE(review): if several nvkind clusters exist, CLUSTER_NAME holds all of
+# their names and the later `load docker-image` calls are expected to fail;
+# confirm stale clusters are always deleted before this script runs.
+CLUSTER_NAME=$("${KIND}" get clusters | grep '^nvkind')
+nvkind cluster print-gpus
+
+# Install gpu-operator to make sure we can run GPU workloads.
+echo "Install NVIDIA GPU Operator"
+kubectl create ns gpu-operator
+kubectl label --overwrite ns gpu-operator pod-security.kubernetes.io/enforce=privileged
+
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
+
+helm install --wait --generate-name \
+  -n gpu-operator --create-namespace \
+  nvidia/gpu-operator \
+  --version="${GPU_OPERATOR_VERSION}"
+
+# Validation steps for GPU operator installation
+kubectl get ns gpu-operator
+kubectl get ns gpu-operator --show-labels | grep pod-security.kubernetes.io/enforce=privileged
+helm list -n gpu-operator
+# Best effort: a pod that does not become Ready is reported but does not fail the script.
+kubectl get pods -n gpu-operator -o name | while read -r pod; do
+  kubectl wait --for=condition=Ready --timeout=300s "$pod" -n gpu-operator || echo "$pod failed to become Ready"
+done
+kubectl get pods -n gpu-operator
+# The column spec is single-quoted so the backslashes escaping the dots in
+# "nvidia.com/gpu" reach kubectl instead of being removed by the shell.
+kubectl get nodes -o=custom-columns='NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'
+
+# Load Kubeflow Trainer images
+echo "Load Kubeflow Trainer images"
+"${KIND}" load docker-image "${CONTROLLER_MANAGER_CI_IMAGE}" --name "${CLUSTER_NAME}"
+
+# Deploy Kubeflow Trainer control plane
+echo "Deploy Kubeflow Trainer control plane"
+E2E_MANIFESTS_DIR="artifacts/e2e/manifests"
+mkdir -p "${E2E_MANIFESTS_DIR}"
+cat <<EOF >"${E2E_MANIFESTS_DIR}/kustomization.yaml"
+  apiVersion: kustomize.config.k8s.io/v1beta1
+  kind: Kustomization
+  resources:
+  - ../../../manifests/overlays/manager
+  images:
+  - name: "${CONTROLLER_MANAGER_CI_IMAGE_NAME}"
+    newTag: "${CONTROLLER_MANAGER_CI_IMAGE_TAG}"
+EOF
+
+kubectl apply --server-side -k "${E2E_MANIFESTS_DIR}"
+
+# We should wait until Deployment is in Ready status.
+echo "Wait for Kubeflow Trainer to be ready"
+(kubectl wait deploy/kubeflow-trainer-controller-manager --for=condition=available -n "${NAMESPACE}" --timeout "${TIMEOUT}" &&
+  kubectl wait pods --for=condition=ready -n "${NAMESPACE}" --timeout "${TIMEOUT}" --all) ||
+  (
+    echo "Failed to wait until Kubeflow Trainer is ready" &&
+      kubectl get pods -n "${NAMESPACE}" &&
+      kubectl describe pods -n "${NAMESPACE}" &&
+      exit 1
+  )
+
+# Print client/server versions, cluster endpoints, nodes, and the pods in ${NAMESPACE}.
+print_cluster_info() {
+  kubectl version
+  kubectl cluster-info
+  kubectl get nodes
+  kubectl get pods -n "${NAMESPACE}"
+  kubectl describe pod -n "${NAMESPACE}"
+}
+
+# TODO (andreyvelich): Currently, we print manager logs due to flaky test.
+echo "Deploy Kubeflow Trainer runtimes"
+kubectl apply --server-side -k manifests/overlays/runtimes || (
+  kubectl logs -n "${NAMESPACE}" -l app.kubernetes.io/name=trainer &&
+    print_cluster_info &&
+    exit 1
+)
+
+# TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster.
+TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime
+docker pull "${TORCH_RUNTIME_IMAGE}"
+"${KIND}" load docker-image "${TORCH_RUNTIME_IMAGE}" --name "${CLUSTER_NAME}"
+
+print_cluster_info