|
# Display name of this workflow.
name: GPU E2E Test

# Run on pull request activity. `labeled` is listed so that adding a label to
# an existing pull request also starts a run.
on:
  pull_request:
    types:
      - opened
      - reopened
      - synchronize
      - labeled
| 6 | + |
jobs:
  # Runs the GPU end-to-end notebook test. Every step after the label check is
  # gated on `steps.check-label.outputs.skip`, so a pull request without the
  # `ok-to-test-gpu-runner` label finishes successfully without running tests.
  gpu-e2e-test:
    name: GPU E2E Test
    runs-on: oracle-vm-16cpu-a10gpu-240gb

    env:
      GOPATH: ${{ github.workspace }}/go
    defaults:
      run:
        # Default directory for every `run` step; it is created by the
        # "Check out code" step below.
        working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer

    strategy:
      fail-fast: false
      matrix:
        kubernetes-version: ["1.34.0"]

    steps:
      - name: Check GPU label
        id: check-label
        # This step runs before checkout, so the job-wide default working
        # directory may not exist yet; use the workspace root instead.
        working-directory: ${{ github.workspace }}
        env:
          # `contains` on an array matches whole elements, so only the exact
          # label enables the tests. Passing the result through the environment
          # keeps expression output out of the script text.
          HAS_GPU_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test-gpu-runner') }}
        run: |
          if [[ "${HAS_GPU_LABEL}" == "true" ]]; then
            echo "Label found. Running GPU tests."
            echo "skip=false" >> "${GITHUB_OUTPUT}"
          else
            echo "✅ Skipping GPU E2E tests (label not present)."
            echo "skip=true" >> "${GITHUB_OUTPUT}"
          fi

      - name: Check out code
        if: steps.check-label.outputs.skip == 'false'
        uses: actions/checkout@v4
        with:
          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer

      - name: Setup Go
        if: steps.check-label.outputs.skip == 'false'
        uses: actions/setup-go@v5
        with:
          go-version-file: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/go.mod

      - name: Setup Python
        if: steps.check-label.outputs.skip == 'false'
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is read as a string rather than a float.
          python-version: "3.11"

      - name: Install dependencies
        if: steps.check-label.outputs.skip == 'false'
        run: |
          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5
          pip install git+https://github.com/kubeflow/sdk.git@main

      - name: Setup cluster with GPU support using nvidia/kind
        if: steps.check-label.outputs.skip == 'false'
        run: |
          make test-e2e-setup-gpu-cluster K8S_VERSION=${{ matrix.kubernetes-version }}

      - name: Run e2e test on GPU cluster
        if: steps.check-label.outputs.skip == 'false'
        run: |
          mkdir -p artifacts/notebooks
          make test-e2e-notebook \
            NOTEBOOK_INPUT=./examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb \
            NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_alpaca-trainjob-yaml.ipynb \
            TIMEOUT=900

      - name: Upload Artifacts to GitHub
        # `always()` keeps the upload running after a failed test step; the
        # skip check avoids an upload attempt when no tests were run.
        if: always() && steps.check-label.outputs.skip == 'false'
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.kubernetes-version }}
          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/artifacts/*
          retention-days: 1

  # Cleanup job: runs after `gpu-e2e-test` regardless of its result and removes
  # the `kind-gpu` cluster if one exists.
  # NOTE(review): this presumes the job is scheduled on the same runner host
  # that created the cluster - confirm there is a single runner for this label.
  delete-kind-cluster:
    name: Delete kind Cluster
    runs-on: oracle-vm-16cpu-a10gpu-240gb
    needs: [gpu-e2e-test]
    if: always()
    steps:
      - name: Delete any existing kind cluster
        run: |
          sudo kind delete cluster --name kind-gpu && echo "kind cluster has been deleted" || echo "kind cluster doesn't exist"
0 commit comments