fix: disable loss exporting for medium training job (#347)

RobotSail · web-flow · commit 64ee152f6b45 · 2024-11-14T12:59:52.000-05:00
The medium training job is not currently running through the training library
and therefore does not emit the same logs. This commit disables the log exporting
logic as it is currently breaking. We intend to re-introduce this logic into CI
once the medium job is aligned with it.

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -154,19 +154,21 @@ jobs:
           # set preserve to true so we can retain the logs
           ./scripts/e2e-ci.sh -mp
           
+          # HACK(osilkin): The step above runs the medium workflow test, which does not actually exercise the training library.
+          #                The training logs will therefore not exist in the expected location, so the log lookup and upload below are disabled.
           # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
           # and we know that it will be written into a directory created by `mktemp -d`. 
           # Given this information, we can use the following command to find the file:
-          log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
-          mv "${log_file}" training-log.jsonl
+          # log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
+          # mv "${log_file}" training-log.jsonl
 
-      - name: Upload training logs
-        uses: actions/upload-artifact@v4
-        with:
-          name: training-log.jsonl
-          path: ./instructlab/training-log.jsonl
-          retention-days: 1
-          overwrite: true
+      # - name: Upload training logs
+      #   uses: actions/upload-artifact@v4
+      #   with:
+      #     name: training-log.jsonl
+      #     path: ./instructlab/training-log.jsonl
+      #     retention-days: 1
+      #     overwrite: true
 
   stop-medium-ec2-runner:
     needs:
@@ -195,39 +197,39 @@ jobs:
           label: ${{ needs.start-medium-ec2-runner.outputs.label }}
           ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
       
-      - name: Download loss data
-        id: download-logs
-        uses: actions/download-artifact@v4
-        with:
-          name: training-log.jsonl
-          path: downloaded-data
-
-      - name: Install dependencies
-        run: |
-          pip install -r requirements-dev.txt
+      # - name: Download loss data
+      #   id: download-logs
+      #   uses: actions/download-artifact@v4
+      #   with:
+      #     name: training-log.jsonl
+      #     path: downloaded-data
+
+      # - name: Install dependencies
+      #   run: |
+      #     pip install -r requirements-dev.txt
       
-      - name: Try to upload to s3
-        id: upload-s3
-        continue-on-error: true
-        run: |
-          output_file='./test.md' 
-          python scripts/create-loss-graph.py  \
-            --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
-            --output-file "${output_file}" \
-            --aws-region "${{ vars.AWS_REGION }}" \
-            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
-            --base-branch "${{ github.event.pull_request.base.ref }}" \
-            --pr-number "${{ github.event.pull_request.number }}" \
-            --head-sha "${{ github.event.pull_request.head.sha }}" \
-            --origin-repository "${{ github.repository }}"
-
-          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
-
-      - name: Check S3 upload status
-        if: steps.upload-s3.outcome == 'failure'
-        run: |
-          echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
-          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
+      # - name: Try to upload to s3
+      #   id: upload-s3
+      #   continue-on-error: true
+      #   run: |
+      #     output_file='./test.md' 
+      #     python scripts/create-loss-graph.py  \
+      #       --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
+      #       --output-file "${output_file}" \
+      #       --aws-region "${{ vars.AWS_REGION }}" \
+      #       --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+      #       --base-branch "${{ github.event.pull_request.base.ref }}" \
+      #       --pr-number "${{ github.event.pull_request.number }}" \
+      #       --head-sha "${{ github.event.pull_request.head.sha }}" \
+      #       --origin-repository "${{ github.repository }}"
+
+      #     cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
+
+      # - name: Check S3 upload status
+      #   if: steps.upload-s3.outcome == 'failure'
+      #   run: |
+      #     echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
+      #     echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
 
   e2e-medium-workflow-complete:
     # we don't want to block PRs on failed EC2 cleanup