Skip to content

Commit 7439fc6

Browse files
committed
Merge branch 'master' into issue-2789/implement-cluster-training-runtimes-deprecation-process
2 parents 3b44038 + b9f0602 commit 7439fc6

74 files changed

Lines changed: 7701 additions & 1247 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
# GPU end-to-end test workflow.
#
# Runs a notebook-based e2e test on a GPU runner for pull requests. The
# expensive steps only execute when the pull request carries a label containing
# "ok-to-test-gpu-runner"; otherwise the test job exits early and succeeds.
# A follow-up job always attempts to delete the "kind-gpu" cluster.
name: GPU E2E Test

on:
  pull_request:
    types: [opened, reopened, synchronize, labeled]

jobs:
  gpu-e2e-test:
    name: GPU E2E Test
    runs-on: oracle-vm-16cpu-a10gpu-240gb

    env:
      GOPATH: ${{ github.workspace }}/go
    defaults:
      run:
        # Every `run` step executes from the checked-out repository unless the
        # step overrides `working-directory` itself.
        working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer

    strategy:
      fail-fast: false
      matrix:
        kubernetes-version: ["1.34.0"]

    steps:
      - name: Check GPU label
        id: check-label
        # The job's default working directory is the checkout path, which is
        # only populated by the "Check out code" step below. Run this gate from
        # the workspace root so it does not depend on that path existing.
        working-directory: ${{ github.workspace }}
        env:
          # Hand the label names to the script through the environment instead
          # of expanding them inside the script text, so a label name can never
          # be interpreted as shell code.
          PR_LABELS: ${{ join(github.event.pull_request.labels.*.name, ',') }}
        run: |
          if [[ "$PR_LABELS" != *"ok-to-test-gpu-runner"* ]]; then
            echo "✅ Skipping GPU E2E tests (label not present)."
            echo "skip=true" >> "$GITHUB_OUTPUT"
            exit 0
          else
            echo "Label found. Running GPU tests."
            echo "skip=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Check out code
        if: steps.check-label.outputs.skip == 'false'
        uses: actions/checkout@v4
        with:
          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer

      - name: Setup Go
        if: steps.check-label.outputs.skip == 'false'
        uses: actions/setup-go@v5
        with:
          go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/go.mod

      - name: Setup Python
        if: steps.check-label.outputs.skip == 'false'
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is always read as a string, never a float.
          python-version: "3.11"

      - name: Install dependencies
        if: steps.check-label.outputs.skip == 'false'
        run: |
          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
          pip install git+https://github.com/kubeflow/sdk.git@main

      - name: Setup cluster with GPU support using nvidia/kind
        if: steps.check-label.outputs.skip == 'false'
        run: |
          make test-e2e-setup-gpu-cluster K8S_VERSION=${{ matrix.kubernetes-version }}

      - name: Run e2e test on GPU cluster
        if: steps.check-label.outputs.skip == 'false'
        run: |
          mkdir -p artifacts/notebooks
          make test-e2e-notebook \
            NOTEBOOK_INPUT=./examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb \
            NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_alpaca-trainjob-yaml.ipynb \
            TIMEOUT=900

      - name: Upload Artifacts to GitHub
        # `always()` keeps the upload running after a failed test step; the
        # second condition avoids an upload attempt when the GPU tests were
        # skipped and this run therefore produced no artifacts.
        if: always() && steps.check-label.outputs.skip == 'false'
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.kubernetes-version }}
          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/artifacts/*
          retention-days: 1

  delete-kind-cluster:
    name: Delete kind Cluster
    runs-on: oracle-vm-16cpu-a10gpu-240gb
    needs: [gpu-e2e-test]
    # Clean up regardless of whether the test job passed, failed or skipped.
    if: always()
    steps:
      - name: Delete any existing kind cluster
        run: |
          sudo kind delete cluster --name kind-gpu && echo "kind cluster has been deleted" || echo "kind cluster doesn't exist"

.github/workflows/test-e2e.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717
fail-fast: false
1818
matrix:
1919
# Kubernetes versions for e2e tests on Kind cluster.
20-
kubernetes-version: ["1.30.0", "1.31.0", "1.32.3", "1.33.1"]
20+
kubernetes-version: ["1.31.0", "1.32.3", "1.33.1", "1.34.0"]
2121

2222
steps:
2323
- name: Check out code

Makefile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ TRAINER_CHART_DIR := $(PROJECT_DIR)/charts/kubeflow-trainer
1818
LOCALBIN ?= $(PROJECT_DIR)/bin
1919

2020
# Tool versions
21-
K8S_VERSION ?= 1.33.0
21+
K8S_VERSION ?= 1.34.0
2222
GINKGO_VERSION ?= $(shell go list -m -f '{{.Version}}' github.com/onsi/ginkgo/v2)
23-
ENVTEST_VERSION ?= release-0.21
23+
ENVTEST_VERSION ?= release-0.22
2424
CONTROLLER_GEN_VERSION ?= v0.18.0
2525
KIND_VERSION ?= $(shell go list -m -f '{{.Version}}' sigs.k8s.io/kind)
2626
HELM_VERSION ?= v3.15.3
@@ -178,6 +178,10 @@ test-python-integration: ## Run Python integration test.
178178
test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test.
179179
KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-cluster.sh
180180

181+
.PHONY: test-e2e-setup-gpu-cluster
182+
test-e2e-setup-gpu-cluster: kind ## Setup Kind cluster for GPU e2e test.
183+
KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-gpu-cluster.sh
184+
181185
.PHONY: test-e2e
182186
test-e2e: ginkgo ## Run Go e2e test.
183187
$(GINKGO) -v ./test/e2e/...

OWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
approvers:
22
- andreyvelich
3+
- astefanutti
34
- Electronic-Waste
45
- gaocegege
56
- Jeffwan
67
- johnugeorge
78
- tenzen-y
89
- terrytangyuan
910
reviewers:
10-
- astefanutti
1111
- jinchihe
1212
- kuizhiqing
1313
emeritus_approvers:

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
[![Go Report Card](https://goreportcard.com/badge/github.com/kubeflow/trainer)](https://goreportcard.com/report/github.com/kubeflow/trainer)
66
[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10435/badge)](https://www.bestpractices.dev/projects/10435)
77
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/kubeflow/trainer)
8+
[![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2Fkubeflow%2Ftrainer.svg?type=shield)](https://app.fossa.com/projects/git%2Bgithub.com%2Fkubeflow%2Ftrainer?ref=badge_shield)
89

910
<h1 align="center">
1011
<img src="./docs/images/trainer-logo.svg" alt="logo" width="200">

0 commit comments

Comments
 (0)