File tree Expand file tree Collapse file tree
examples/torchtune/llama3_2 Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -65,17 +65,17 @@ jobs:
6565 run : |
6666 make test-e2e-delete-gpu-cluster
6767
68- - name : Setup cluster with GPU
68+ - name : Setup cluster with GPU support using nvidia/kind
6969 if : steps.check-label.outputs.skip == 'false'
7070 run : |
7171 make test-e2e-setup-gpu-cluster K8S_VERSION=${{ matrix.kubernetes-version }}
7272
73- # - name: Run e2e with Go
74- # if: steps.check-label.outputs.skip == 'false'
75- # run: |
76- # make test-e2e || (kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer && exit 1)
73+ - name : Run e2e with Go
74+ if : steps.check-label.outputs.skip == 'false'
75+ run : |
76+ make test-e2e || (kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer && exit 1)
7777
78- - name : Run e2e test for example Notebooks
78+ - name : Run e2e test for torchtrainer notebook on GPU cluster
7979 if : steps.check-label.outputs.skip == 'false'
8080 run : |
8181 mkdir -p artifacts/notebooks
Original file line number Diff line number Diff line change @@ -179,7 +179,7 @@ test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test.
179179 KIND=$(KIND ) K8S_VERSION=$(K8S_VERSION ) ./hack/e2e-setup-cluster.sh
180180
181181.PHONY : test-e2e-setup-gpu-cluster
182- test-e2e-setup-gpu-cluster : kind # # Setup Kind cluster with GPU e2e test.
182+ test-e2e-setup-gpu-cluster : kind # # Setup Kind cluster for GPU e2e test.
183183 KIND=$(KIND ) K8S_VERSION=$(K8S_VERSION ) ./hack/e2e-setup-gpu-cluster.sh
184184
185185.PHONY : test-e2e-delete-cluster
Original file line number Diff line number Diff line change 199199 " ),\n " ,
200200 " model=HuggingFaceModelInitializer(\n " ,
201201 " storage_uri=\" hf://meta-llama/Llama-3.2-1B-Instruct\" ,\n " ,
202- " access_token=os.environ[\" HF_TOKEN_AKASH \" ] # Replace with your Hugging Face token,\n " ,
202+ " access_token=os.environ[\" HF_TOKEN \" ] # Replace with your Hugging Face token,\n " ,
203203 " )\n " ,
204204 " ),\n " ,
205205 " trainer=BuiltinTrainer(\n " ,
Original file line number Diff line number Diff line change @@ -42,7 +42,7 @@ print_results() {
4242 kubectl describe trainjob
4343 kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer
4444 kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1
45- kubectl wait trainjob --for=condition=Complete --all --timeout 900s
45+ kubectl wait trainjob --for=condition=Complete --all --timeout 600s
4646}
4747
4848(papermill " ${NOTEBOOK_INPUT} " " ${NOTEBOOK_OUTPUT} " --execution-timeout " ${PAPERMILL_TIMEOUT} " && print_results) ||
You can’t perform that action at this time.
0 commit comments