Skip to content

Commit ca56b04

Browse files
committed
chore: refactored code
Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com>
1 parent 27b9c88 commit ca56b04

4 files changed

Lines changed: 9 additions & 9 deletions

File tree

.github/workflows/test-e2e-gpu.yaml

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -65,17 +65,17 @@ jobs:
6565
run: |
6666
make test-e2e-delete-gpu-cluster
6767
68-
- name: Setup cluster with GPU
68+
- name: Setup cluster with GPU support using nvidia/kind
6969
if: steps.check-label.outputs.skip == 'false'
7070
run: |
7171
make test-e2e-setup-gpu-cluster K8S_VERSION=${{ matrix.kubernetes-version }}
7272
73-
# - name: Run e2e with Go
74-
# if: steps.check-label.outputs.skip == 'false'
75-
# run: |
76-
# make test-e2e || (kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer && exit 1)
73+
- name: Run e2e with Go
74+
if: steps.check-label.outputs.skip == 'false'
75+
run: |
76+
make test-e2e || (kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer && exit 1)
7777
78-
- name: Run e2e test for example Notebooks
78+
- name: Run e2e test for torchtrainer notebook on GPU cluster
7979
if: steps.check-label.outputs.skip == 'false'
8080
run: |
8181
mkdir -p artifacts/notebooks

Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -179,7 +179,7 @@ test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test.
179179
KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-cluster.sh
180180

181181
.PHONY: test-e2e-setup-gpu-cluster
182-
test-e2e-setup-gpu-cluster: kind ## Setup Kind cluster with GPU e2e test.
182+
test-e2e-setup-gpu-cluster: kind ## Setup Kind cluster for GPU e2e test.
183183
KIND=$(KIND) K8S_VERSION=$(K8S_VERSION) ./hack/e2e-setup-gpu-cluster.sh
184184

185185
.PHONY: test-e2e-delete-cluster

examples/torchtune/llama3_2/alpaca-trainjob-yaml.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@
199199
" ),\n",
200200
" model=HuggingFaceModelInitializer(\n",
201201
" storage_uri=\"hf://meta-llama/Llama-3.2-1B-Instruct\",\n",
202-
" access_token=os.environ[\"HF_TOKEN_AKASH\"] # Replace with your Hugging Face token,\n",
202+
" access_token=os.environ[\"HF_TOKEN\"] # Replace with your Hugging Face token,\n",
203203
" )\n",
204204
" ),\n",
205205
" trainer=BuiltinTrainer(\n",

hack/e2e-run-notebook.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ print_results() {
4242
kubectl describe trainjob
4343
kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer
4444
kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1
45-
kubectl wait trainjob --for=condition=Complete --all --timeout 900s
45+
kubectl wait trainjob --for=condition=Complete --all --timeout 600s
4646
}
4747

4848
(papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) ||

0 commit comments

Comments
 (0)