@@ -154,19 +154,21 @@ jobs:
154154 # set preserve to true so we can retain the logs
155155 ./scripts/e2e-ci.sh -mp
156156
157+ # HACK(osilkin): The step above runs the medium workflow test, which does not actually exercise the training library.
158+ # As a result, the training logs are never written to the location expected below, so collecting and uploading them is disabled.
157159 # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
158160 # and we know that it will be written into a directory created by `mktemp -d`.
159161 # Given this information, we can use the following command to find the file:
160- log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
161- mv "${log_file}" training-log.jsonl
162+ # log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
163+ # mv "${log_file}" training-log.jsonl
162164
163- - name : Upload training logs
164- uses : actions/upload-artifact@v4
165- with :
166- name : training-log.jsonl
167- path : ./instructlab/training-log.jsonl
168- retention-days : 1
169- overwrite : true
165+ # - name: Upload training logs
166+ # uses: actions/upload-artifact@v4
167+ # with:
168+ # name: training-log.jsonl
169+ # path: ./instructlab/training-log.jsonl
170+ # retention-days: 1
171+ # overwrite: true
170172
171173 stop-medium-ec2-runner :
172174 needs :
@@ -195,39 +197,39 @@ jobs:
195197 label : ${{ needs.start-medium-ec2-runner.outputs.label }}
196198 ec2-instance-id : ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
197199
198- - name : Download loss data
199- id : download-logs
200- uses : actions/download-artifact@v4
201- with :
202- name : training-log.jsonl
203- path : downloaded-data
204-
205- - name : Install dependencies
206- run : |
207- pip install -r requirements-dev.txt
200+ # - name: Download loss data
201+ # id: download-logs
202+ # uses: actions/download-artifact@v4
203+ # with:
204+ # name: training-log.jsonl
205+ # path: downloaded-data
206+
207+ # - name: Install dependencies
208+ # run: |
209+ # pip install -r requirements-dev.txt
208210
209- - name : Try to upload to s3
210- id : upload-s3
211- continue-on-error : true
212- run : |
213- output_file='./test.md'
214- python scripts/create-loss-graph.py \
215- --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
216- --output-file "${output_file}" \
217- --aws-region "${{ vars.AWS_REGION }}" \
218- --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
219- --base-branch "${{ github.event.pull_request.base.ref }}" \
220- --pr-number "${{ github.event.pull_request.number }}" \
221- --head-sha "${{ github.event.pull_request.head.sha }}" \
222- --origin-repository "${{ github.repository }}"
223-
224- cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
225-
226- - name : Check S3 upload status
227- if : steps.upload-s3.outcome == 'failure'
228- run : |
229- echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
230- echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
211+ # - name: Try to upload to s3
212+ # id: upload-s3
213+ # continue-on-error: true
214+ # run: |
215+ # output_file='./test.md'
216+ # python scripts/create-loss-graph.py \
217+ # --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
218+ # --output-file "${output_file}" \
219+ # --aws-region "${{ vars.AWS_REGION }}" \
220+ # --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
221+ # --base-branch "${{ github.event.pull_request.base.ref }}" \
222+ # --pr-number "${{ github.event.pull_request.number }}" \
223+ # --head-sha "${{ github.event.pull_request.head.sha }}" \
224+ # --origin-repository "${{ github.repository }}"
225+
226+ # cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
227+
228+ # - name: Check S3 upload status
229+ # if: steps.upload-s3.outcome == 'failure'
230+ # run: |
231+ # echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
232+ # echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
231233
232234 e2e-medium-workflow-complete :
233235 # we don't want to block PRs on failed EC2 cleanup
0 commit comments