Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
ebb17bc
feat: support for creating and managing gpu cluster
jaiakash Aug 2, 2025
fcc01f0
fix: makefile bug
jaiakash Aug 13, 2025
e011c07
add: ci action to ask maintainers to add label to when changes are de…
jaiakash Aug 13, 2025
a32d199
chore: fixed issues and cleanup
jaiakash Aug 13, 2025
6ee2921
fix: run check on change in pr
jaiakash Aug 13, 2025
106e2ff
feat: added seperate workflow for gpu runner
jaiakash Aug 13, 2025
3c3f17d
fix: deepspeed typo
jaiakash Aug 14, 2025
f38cef9
hotfix: add gpu label on PR without merging
jaiakash Aug 14, 2025
b0992ae
chore: merged into single action
jaiakash Aug 27, 2025
ccf9d0d
fixL run runner as soon as label is added
jaiakash Aug 27, 2025
a6195cf
fix: use gpu runner when label exist
jaiakash Aug 27, 2025
dc01280
fix: revert changes and fix script permission
jaiakash Aug 29, 2025
7c9ce64
fix: create gpu supported gpu
jaiakash Aug 29, 2025
44030e9
fix: nvidia issue
jaiakash Aug 29, 2025
5790054
fix: gpu cluster and torchtune model
jaiakash Aug 29, 2025
268768f
fix: notebookpath and delete cluster
jaiakash Aug 29, 2025
d3506e7
tmp fix: notebook to use k8s client
jaiakash Aug 30, 2025
c598d87
fix: use akash sdk and fix notenook size
jaiakash Aug 30, 2025
79c1835
fix: notebook error
jaiakash Aug 30, 2025
9404704
fix: delete cluster before creating one and notebook
jaiakash Aug 30, 2025
578e600
fix: kube config
jaiakash Aug 30, 2025
77247e0
fix: makefile add comment
jaiakash Aug 30, 2025
b70bcf2
fix: nvidia runtime
jaiakash Aug 30, 2025
d2a351e
hotfix: disable e2e go
jaiakash Aug 31, 2025
2063417
fix: delete cluster
jaiakash Aug 31, 2025
f85b1ad
fix: delete cluster
jaiakash Aug 31, 2025
27b9c88
hotfix: temporarly use my personal token
jaiakash Aug 31, 2025
ca56b04
chore: refactored code
jaiakash Aug 31, 2025
19bdefa
use: self runner
jaiakash Aug 31, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions .github/workflows/test-e2e-gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# GPU end-to-end tests, executed on the runner labelled `self-runner`.
#
# The job starts for every listed pull_request event, but all expensive steps
# are gated on the PR carrying the `ok-to-test-gpu-runner` label. Without the
# label the job only checks out the code, prints a skip message and succeeds.
name: GPU E2E Test

on:
  pull_request:
    types: [opened, reopened, synchronize, labeled]

jobs:
  gpu-e2e-test:
    name: GPU E2E Test
    runs-on: self-runner

    env:
      GOPATH: ${{ github.workspace }}/go
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    defaults:
      run:
        working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer

    strategy:
      fail-fast: false
      matrix:
        # Quoted so the version is never re-typed as a number.
        kubernetes-version: ["1.33.1"]

    steps:
      - name: Check out code
        uses: actions/checkout@v4
        with:
          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer

      # Sets the `skip` output consumed by every following step.
      # The label test is evaluated by the expression engine (exact match on a
      # label name) and handed to the script through an environment variable,
      # so label text is never spliced into the shell source.
      - name: Check GPU label
        id: check-label
        env:
          HAS_GPU_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test-gpu-runner') }}
        run: |
          if [[ "${HAS_GPU_LABEL}" != "true" ]]; then
            echo "Label 'ok-to-test-gpu-runner' not found. Skipping GPU tests."
            echo "skip=true" >> "$GITHUB_OUTPUT"
          else
            echo "Label found. Running GPU tests."
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Skip message
        if: steps.check-label.outputs.skip == 'true'
        run: echo "✅ Skipped GPU E2E tests (label not present)."

      - name: Setup Go
        if: steps.check-label.outputs.skip == 'false'
        uses: actions/setup-go@v5
        with:
          go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/go.mod

      - name: Setup Python
        if: steps.check-label.outputs.skip == 'false'
        uses: actions/setup-python@v5
        with:
          # Quoted: an unquoted x.y version is parsed as a float (3.10 -> 3.1).
          python-version: "3.11"

      - name: Install dependencies
        if: steps.check-label.outputs.skip == 'false'
        run: |
          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
          pip install git+https://github.com/kubeflow/sdk.git@main

      # Runs before cluster creation so a cluster left behind on the runner by
      # an earlier job does not interfere with this one.
      - name: Delete any existing cluster
        if: steps.check-label.outputs.skip == 'false'
        run: |
          make test-e2e-delete-gpu-cluster

      - name: Setup cluster with GPU support using nvidia/kind
        if: steps.check-label.outputs.skip == 'false'
        run: |
          make test-e2e-setup-gpu-cluster K8S_VERSION=${{ matrix.kubernetes-version }}

      # On failure, dump the trainer controller logs and still fail the step.
      - name: Run e2e with Go
        if: steps.check-label.outputs.skip == 'false'
        run: |
          make test-e2e || (kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer && exit 1)

      - name: Run e2e test for torchtrainer notebook on GPU cluster
        if: steps.check-label.outputs.skip == 'false'
        run: |
          mkdir -p artifacts/notebooks
          make test-e2e-notebook NOTEBOOK_INPUT=./examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_alpaca-trainjob-yaml.ipynb TIMEOUT=900

      # Runs even when an earlier step failed so executed notebooks are kept.
      - name: Upload Artifacts to GitHub
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.kubernetes-version }}
          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/artifacts/*
          retention-days: 1
8 changes: 8 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ test-python-integration: ## Run Python integration test.
test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test.
KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-cluster.sh

# Builds the controller image and creates the GPU-enabled Kind cluster by
# running hack/e2e-setup-gpu-cluster.sh with the kind binary and K8s version.
.PHONY: test-e2e-setup-gpu-cluster
test-e2e-setup-gpu-cluster: kind ## Setup Kind cluster for GPU e2e test.
	KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-gpu-cluster.sh

# Runs hack/e2e-delete-gpu-cluster.sh. The .PHONY name must match the target
# name, otherwise the target is not actually declared phony.
.PHONY: test-e2e-delete-gpu-cluster
test-e2e-delete-gpu-cluster: kind ## Delete Kind cluster.
	KIND=$(KIND) ./hack/e2e-delete-gpu-cluster.sh

.PHONY: test-e2e
test-e2e: ginkgo ## Run Go e2e test.
$(GINKGO) -v ./test/e2e/...
Expand Down
36 changes: 20 additions & 16 deletions examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@
"id": "288ec515",
"metadata": {},
"outputs": [],
"source": "!pip install git+https://github.com/kubeflow/sdk.git@main"
"source": [
"!pip install git+https://github.com/kubeflow/sdk.git@main"
]
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -73,6 +75,8 @@
"source": [
"# List all available Kubeflow Training Runtimes.\n",
"from kubeflow.trainer import *\n",
"from kubernetes import client as k8s_client\n",
"import os\n",
"\n",
"client = TrainerClient()\n",
"for runtime in client.list_runtimes():\n",
Expand Down Expand Up @@ -154,16 +158,16 @@
],
"source": [
"# Create a PersistentVolumeClaim for the TorchTune Llama 3.2 1B model.\n",
"client.core_api.create_namespaced_persistent_volume_claim(\n",
"client.backend.core_api.create_namespaced_persistent_volume_claim(\n",
" namespace=\"default\",\n",
" body=client.V1PersistentVolumeClaim(\n",
" body=k8s_client.V1PersistentVolumeClaim(\n",
" api_version=\"v1\",\n",
" kind=\"PersistentVolumeClaim\",\n",
" metadata=client.V1ObjectMeta(name=\"torchtune-llama3.2-1b\"),\n",
" spec=client.V1PersistentVolumeClaimSpec(\n",
" metadata=k8s_client.V1ObjectMeta(name=\"torchtune-llama3.2-1b\"),\n",
" spec=k8s_client.V1PersistentVolumeClaimSpec(\n",
" access_modes=[\"ReadWriteOnce\"],\n",
" resources=client.V1ResourceRequirements(\n",
" requests={\"storage\": \"20Gi\"}\n",
" resources=k8s_client.V1ResourceRequirements(\n",
" requests={\"storage\": \"200Gi\"}\n",
" ),\n",
" ),\n",
" ),\n",
Expand All @@ -188,26 +192,26 @@
"outputs": [],
"source": [
"job_name = client.train(\n",
" runtime=Runtime(\n",
" name=\"torchtune-llama3.2-1b\"\n",
" ),\n",
" runtime=client.get_runtime(name=\"torchtune-llama3.2-1b\"),\n",
" initializer=Initializer(\n",
" dataset=HuggingFaceDatasetInitializer(\n",
" storage_uri=\"hf://tatsu-lab/alpaca/data\"\n",
" ),\n",
" model=HuggingFaceModelInitializer(\n",
" storage_uri=\"hf://meta-llama/Llama-3.2-1B-Instruct\",\n",
" access_token=\"<YOUR_HF_TOKEN>\" # Replace with your Hugging Face token,\n",
" access_token=os.environ[\"HF_TOKEN\"] # Replace with your Hugging Face token,\n",
" )\n",
" ),\n",
" trainer=BuiltinTrainer(\n",
" config=TorchTuneConfig(\n",
" dataset_preprocess_config=TorchTuneInstructDataset(\n",
" source=DataFormat.PARQUET,\n",
" source=DataFormat.PARQUET, split=\"train[:1000]\"\n",
" ),\n",
" resources_per_node={\n",
" \"memory\": \"200G\",\n",
" \"gpu\": 1,\n",
" }\n",
" },\n",
" \n",
" )\n",
" )\n",
")"
Expand Down Expand Up @@ -248,7 +252,7 @@
"from kubeflow.trainer.constants import constants\n",
"\n",
"log_dict = client.get_job_logs(job_name, follow=False, step=constants.DATASET_INITIALIZER)\n",
"print(log_dict[constants.DATASET_INITIALIZER])"
"print(log_dict.get(constants.DATASET_INITIALIZER, \"No logs found for dataset initializer.\"))"
]
},
{
Expand Down Expand Up @@ -280,7 +284,7 @@
],
"source": [
"log_dict = client.get_job_logs(job_name, follow=False, step=constants.MODEL_INITIALIZER)\n",
"print(log_dict[constants.MODEL_INITIALIZER])"
"print(log_dict.get(constants.MODEL_INITIALIZER, \"No logs found for model initializer.\"))"
]
},
{
Expand Down Expand Up @@ -393,7 +397,7 @@
],
"source": [
"log_dict = client.get_job_logs(job_name, follow=False)\n",
"print(log_dict[f\"{constants.NODE}-0\"])"
"print(log_dict.get(f\"{constants.NODE}-0\", \"No logs found for trainer.\"))"
]
},
{
Expand Down
39 changes: 39 additions & 0 deletions hack/e2e-delete-gpu-cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env bash

# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This shell is used to delete the Kind clusters whose name starts with
# "nvkind", i.e. the GPU clusters used by Kubeflow Trainer e2e tests.

set -o errexit
set -o nounset
set -o pipefail
set -x

# Use the kind binary handed in by the caller (the Makefile target exports
# KIND); fall back to `kind` from PATH when KIND is not set.
KIND=${KIND:-kind}

# List clusters as a separate step: if `kind` itself fails, errexit aborts the
# script instead of the failure being reported as "nothing to delete".
ALL_CLUSTERS=$("${KIND}" get clusters)

# Find all clusters with prefix "nvkind". grep exits 1 when nothing matches,
# which is an expected outcome here, hence the `|| true`.
CLUSTERS=$(grep '^nvkind' <<<"${ALL_CLUSTERS}" || true)

if [[ -z "${CLUSTERS}" ]]; then
  echo "No nvkind clusters found. Nothing to delete."
  exit 0
fi

# Best effort: a cluster that cannot be deleted is reported and the loop moves
# on to the next one; the script still exits 0.
# CLUSTERS is intentionally unquoted so each line becomes one loop item.
for CLUSTER_NAME in ${CLUSTERS}; do
  echo "Deleting Kind cluster: ${CLUSTER_NAME}"
  if "${KIND}" delete cluster --name "${CLUSTER_NAME}"; then
    echo "Successfully deleted ${CLUSTER_NAME}"
  else
    echo "Warning: Failed to delete ${CLUSTER_NAME}, continuing..."
  fi
done
2 changes: 1 addition & 1 deletion hack/e2e-run-notebook.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ print_results() {
kubectl describe trainjob
kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer
kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1
kubectl wait trainjob --for=condition=Complete --all --timeout 3s
kubectl wait trainjob --for=condition=Complete --all --timeout 600s
}

(papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) ||
Expand Down
124 changes: 124 additions & 0 deletions hack/e2e-setup-gpu-cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env bash

# Copyright 2025 The Kubeflow Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This shell is used to setup Kind cluster for Kubeflow Trainer e2e tests.

set -o errexit
set -o nounset
set -o pipefail
set -x

# Configure variables.
# KIND and K8S_VERSION can be overridden from the environment; the others are
# fixed for this script.
# NOTE(review): KIND is assigned here, but the commands in this script invoke
# `kind` from PATH — confirm which binary is intended.
KIND=${KIND:-./bin/kind}
K8S_VERSION=${K8S_VERSION:-1.32.0}
GPU_OPERATOR_VERSION="v25.3.2"
KIND_NODE_VERSION=kindest/node:v${K8S_VERSION}
NAMESPACE="kubeflow-system"
TIMEOUT="5m"

# Kubeflow Trainer images.
# TODO (andreyvelich): Support initializers images.
CONTROLLER_MANAGER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/trainer-controller-manager"
CONTROLLER_MANAGER_CI_IMAGE_TAG="test"
CONTROLLER_MANAGER_CI_IMAGE="${CONTROLLER_MANAGER_CI_IMAGE_NAME}:${CONTROLLER_MANAGER_CI_IMAGE_TAG}"
echo "Build Kubeflow Trainer images"
sudo docker build . -f cmd/trainer-controller-manager/Dockerfile -t "${CONTROLLER_MANAGER_CI_IMAGE}"

# Set up Docker to use NVIDIA runtime.
sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
sudo systemctl restart docker

# Create a Kind cluster with GPU support.
nvkind cluster create --image "${KIND_NODE_VERSION}"

# No cluster name is passed to nvkind above, so look the name up afterwards.
# The list is captured first and filtered second so that a failing `kind`
# aborts the script, and the anchored first-match-only grep guarantees that
# CLUSTER_NAME holds exactly one name even if several nvkind clusters exist.
# grep exits non-zero (and errexit stops the script) when none is found.
ALL_CLUSTERS=$(kind get clusters)
CLUSTER_NAME=$(grep -m 1 '^nvkind' <<<"${ALL_CLUSTERS}")
nvkind cluster print-gpus

# Install gpu-operator to make sure we can run GPU workloads.
echo "Install NVIDIA GPU Operator"
kubectl create ns gpu-operator
kubectl label --overwrite ns gpu-operator pod-security.kubernetes.io/enforce=privileged

helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update

helm install --wait --generate-name \
  -n gpu-operator --create-namespace \
  nvidia/gpu-operator \
  --version="${GPU_OPERATOR_VERSION}"

# Validation steps for GPU operator installation
kubectl get ns gpu-operator
kubectl get ns gpu-operator --show-labels | grep pod-security.kubernetes.io/enforce=privileged
helm list -n gpu-operator
# Wait for each pod individually; a pod that does not become Ready is only
# reported, it does not stop the script. `read -r` keeps the name verbatim.
kubectl get pods -n gpu-operator -o name | while read -r pod; do
  kubectl wait --for=condition=Ready --timeout=300s "$pod" -n gpu-operator || echo "$pod failed to become Ready"
done
kubectl get pods -n gpu-operator
# The argument must be quoted: the backslash escapes the dot inside the
# resource name `nvidia.com/gpu` for kubectl, and an unquoted backslash would
# be consumed by the shell before kubectl sees it.
kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"

# Load Kubeflow Trainer images
echo "Load Kubeflow Trainer images"
kind load docker-image "${CONTROLLER_MANAGER_CI_IMAGE}" --name "${CLUSTER_NAME}"

# Deploy Kubeflow Trainer control plane
# A throw-away kustomization is generated that reuses the manager overlay and
# only retags the controller image to the one built by this script.
echo "Deploy Kubeflow Trainer control plane"
E2E_MANIFESTS_DIR="artifacts/e2e/manifests"
mkdir -p "${E2E_MANIFESTS_DIR}"
# Indentation inside the heredoc is significant: `newTag` has to be nested in
# the same list item as `name`.
cat <<EOF >"${E2E_MANIFESTS_DIR}/kustomization.yaml"
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../../../manifests/overlays/manager
images:
  - name: "${CONTROLLER_MANAGER_CI_IMAGE_NAME}"
    newTag: "${CONTROLLER_MANAGER_CI_IMAGE_TAG}"
EOF

kubectl apply --server-side -k "${E2E_MANIFESTS_DIR}"

# We should wait until Deployment is in Ready status.
# If either wait fails, print pod state for debugging and exit non-zero.
echo "Wait for Kubeflow Trainer to be ready"
(kubectl wait deploy/kubeflow-trainer-controller-manager --for=condition=available -n "${NAMESPACE}" --timeout "${TIMEOUT}" &&
  kubectl wait pods --for=condition=ready -n "${NAMESPACE}" --timeout "${TIMEOUT}" --all) ||
  (
    echo "Failed to wait until Kubeflow Trainer is ready" &&
      kubectl get pods -n "${NAMESPACE}" &&
      kubectl describe pods -n "${NAMESPACE}" &&
      exit 1
  )

# Dump diagnostic information: client/server versions, cluster endpoints,
# the node list, and the pods (listing plus full description) in ${NAMESPACE}.
print_cluster_info() {
  kubectl version
  kubectl cluster-info
  kubectl get nodes
  kubectl get pods -n "${NAMESPACE}"
  kubectl describe pod -n "${NAMESPACE}"
}

# TODO (andreyvelich): Currently, we print manager logs due to flaky test.
# If applying the runtimes fails, print the controller logs and cluster state,
# then exit non-zero.
echo "Deploy Kubeflow Trainer runtimes"
kubectl apply --server-side -k manifests/overlays/runtimes || (
  kubectl logs -n "${NAMESPACE}" -l app.kubernetes.io/name=trainer &&
    print_cluster_info &&
    exit 1
)

# TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster.
# Pull the runtime image on the host and copy it into the Kind cluster.
# Expansions are quoted so each always reaches the command as one argument.
TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime
docker pull "${TORCH_RUNTIME_IMAGE}"
kind load docker-image "${TORCH_RUNTIME_IMAGE}" --name "${CLUSTER_NAME}"

print_cluster_info
Loading