Skip to content

Commit 64ee152

Browse files
authored
fix: disable loss exporting for medium training job (#347)
The medium training job does not currently run through the training library and therefore does not emit the same logs. This commit disables the log-exporting logic because it currently breaks the workflow. We intend to re-introduce this logic into CI once the medium job runs through the training library.

Signed-off-by: Oleg S <[email protected]>
1 parent 8385f42 commit 64ee152

File tree

1 file changed

+43
-41
lines changed

1 file changed

+43
-41
lines changed

.github/workflows/e2e-nvidia-l4-x1.yml

Lines changed: 43 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -154,19 +154,21 @@ jobs:
154154
# set preserve to true so we can retain the logs
155155
./scripts/e2e-ci.sh -mp
156156
157+
# HACK(osilkin): The command above runs the medium workflow test, which does not actually exercise the training library.
158+
# Therefore, we must disable the upload of the training logs, as they will not exist in the expected location.
157159
# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
158160
# and we know that it will be written into a directory created by `mktemp -d`.
159161
# Given this information, we can use the following command to find the file:
160-
log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
161-
mv "${log_file}" training-log.jsonl
162+
# log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
163+
# mv "${log_file}" training-log.jsonl
162164
163-
- name: Upload training logs
164-
uses: actions/upload-artifact@v4
165-
with:
166-
name: training-log.jsonl
167-
path: ./instructlab/training-log.jsonl
168-
retention-days: 1
169-
overwrite: true
165+
# - name: Upload training logs
166+
# uses: actions/upload-artifact@v4
167+
# with:
168+
# name: training-log.jsonl
169+
# path: ./instructlab/training-log.jsonl
170+
# retention-days: 1
171+
# overwrite: true
170172

171173
stop-medium-ec2-runner:
172174
needs:
@@ -195,39 +197,39 @@ jobs:
195197
label: ${{ needs.start-medium-ec2-runner.outputs.label }}
196198
ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
197199

198-
- name: Download loss data
199-
id: download-logs
200-
uses: actions/download-artifact@v4
201-
with:
202-
name: training-log.jsonl
203-
path: downloaded-data
204-
205-
- name: Install dependencies
206-
run: |
207-
pip install -r requirements-dev.txt
200+
# - name: Download loss data
201+
# id: download-logs
202+
# uses: actions/download-artifact@v4
203+
# with:
204+
# name: training-log.jsonl
205+
# path: downloaded-data
206+
207+
# - name: Install dependencies
208+
# run: |
209+
# pip install -r requirements-dev.txt
208210

209-
- name: Try to upload to s3
210-
id: upload-s3
211-
continue-on-error: true
212-
run: |
213-
output_file='./test.md'
214-
python scripts/create-loss-graph.py \
215-
--log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
216-
--output-file "${output_file}" \
217-
--aws-region "${{ vars.AWS_REGION }}" \
218-
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
219-
--base-branch "${{ github.event.pull_request.base.ref }}" \
220-
--pr-number "${{ github.event.pull_request.number }}" \
221-
--head-sha "${{ github.event.pull_request.head.sha }}" \
222-
--origin-repository "${{ github.repository }}"
223-
224-
cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
225-
226-
- name: Check S3 upload status
227-
if: steps.upload-s3.outcome == 'failure'
228-
run: |
229-
echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
230-
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
211+
# - name: Try to upload to s3
212+
# id: upload-s3
213+
# continue-on-error: true
214+
# run: |
215+
# output_file='./test.md'
216+
# python scripts/create-loss-graph.py \
217+
# --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
218+
# --output-file "${output_file}" \
219+
# --aws-region "${{ vars.AWS_REGION }}" \
220+
# --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
221+
# --base-branch "${{ github.event.pull_request.base.ref }}" \
222+
# --pr-number "${{ github.event.pull_request.number }}" \
223+
# --head-sha "${{ github.event.pull_request.head.sha }}" \
224+
# --origin-repository "${{ github.repository }}"
225+
226+
# cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
227+
228+
# - name: Check S3 upload status
229+
# if: steps.upload-s3.outcome == 'failure'
230+
# run: |
231+
# echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
232+
# echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
231233

232234
e2e-medium-workflow-complete:
233235
# we don't want to block PRs on failed EC2 cleanup

0 commit comments

Comments
 (0)