xpu-test #19
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# TODO: this workflow closely resembles _linux-test, but sharing it would mean
# adding roughly a dozen conditionals there. Keeping a dedicated workflow is
# probably the better choice.
name: xpu-test

# Triggered only by pushes of tags matching ciflow/xpu/*.
on:
  push:
    tags:
      - 'ciflow/xpu/*'

# id-token: write is needed by the "configure aws credentials" step, which
# assumes an IAM role; repository contents are only read.
permissions:
  id-token: write
  contents: read

# One run per ref: a newer run on the same ref cancels the one in progress.
# Runs on refs/heads/main are keyed by run number and therefore never collide.
concurrency:
  group: >-
    xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main'
    && github.run_number || github.ref }}
  cancel-in-progress: true
jobs:
  # Checks out PyTorch (nightly) and torchao, health-checks the XPU runner,
  # pulls the PyTorch XPU CI image and runs torchao's XPU test script inside a
  # detached container on the linux.idc.xpu runner.
  test:
    # Don't run on forked repos or empty test matrix
    # (kept disabled: this workflow declares no `inputs`, so the expression
    # below cannot be evaluated here)
    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    timeout-minutes: 60
    runs-on: linux.idc.xpu
    env:
      DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
      PYTORCH_RETRY_TEST_CASES: "1"
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: "1"
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
      # PyTorch is checked out into ./pytorch; calculate-docker-image below
      # uses it as its working directory.
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          ref: nightly
          path: pytorch
          fetch-depth: 1
          submodules: false

      # The repository under test is checked out into ./torchao, which is where
      # TEST_COMMAND in the Test step points.
      - name: Checkout Torchao (ao)
        uses: actions/checkout@v4
        with:
          repository: ${{ github.repository }}
          ref: ${{ github.head_ref || github.ref }}
          path: torchao
          fetch-depth: 1
          submodules: recursive

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          # grep's own command line matches the pattern, so a count of 1 means
          # no other prune is currently running.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner health check system info
        if: always()
        shell: bash
        run: |
          cat /etc/os-release || true
          cat /etc/apt/sources.list.d/oneAPI.list || true
          cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
          whoami

      - name: Runner health check xpu-smi
        if: always()
        shell: bash
        run: |
          timeout 30 xpu-smi discovery || true

      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          # `|| true` keeps the step alive when grep counts zero matches so the
          # explicit error message below is what gets reported.
          ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true)
          msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()

      # Only runs when one of the steps above has failed.
      - name: Runner health check disconnect on failure
        if: ${{ failure() }}
        shell: bash
        run: |
          killall runsvc.sh

      # NOTE(review): no step visible in this workflow consumes
      # /tmp/github_env_<run id> (the docker run below passes no --env-file);
      # confirm whether this step is still needed.
      - name: Preserve github env variables for use in docker
        shell: bash
        run: |
          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

      - name: XPU set GPU_FLAG
        shell: bash
        run: |
          # Add render group for container creation.
          # Exact lookup of the "render" group; a substring grep over
          # /etc/group could match several lines and yield more than one gid.
          render_gid=$(getent group render | cut -d: -f3)
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"

      - name: configure aws credentials
        id: aws_creds
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
          aws-region: us-east-1

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
          docker-image-name: ${{ env.DOCKER_IMAGE }}
          working-directory: pytorch
          repo-name: pytorch

      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          # Keep only what follows the last ':' of the image reference.
          tag=${ECR_DOCKER_IMAGE##*:}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

      # Second GPU check, this time through clinfo; named differently from the
      # xpu-smi based check above so the two are distinguishable in the logs.
      - name: Runner health check GPU count (clinfo)
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Test
        id: test
        env:
          TEST_COMMAND: torchao/.github/scripts/ci_test_xpu.sh
          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 60
        run: |
          set -x
          # The container is started detached; it is stopped by the
          # "Stop container before exit" step and pruned by "Teardown XPU".
          # GPU_FLAG holds several docker arguments and must be word-split, so
          # it is deliberately left unquoted.
          # `docker run --detach` prints the new container's ID, which is what
          # container_name ends up holding. No --name is passed: the variable
          # is not yet assigned while the command line is being built, so the
          # former --name="${container_name}" always expanded to an empty name.
          # NOTE(review): a former `--user $(id -u):$(id -g)` flag was
          # overridden by the later `--user jenkins` and has been dropped;
          # confirm that running as jenkins is the intended behaviour.
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # save container name for later step
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # Run the test script from the mounted workspace inside the container.
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # Print a gdb backtrace, inside the test container, for every core
          # file found under the workspace.
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        shell: bash
        run: |
          # Workaround for multiple runners on same IDC node
          # CONTAINER_NAME is only exported once the Test step has started the
          # container; skip the stop when an earlier step failed before that.
          if [[ -n "${CONTAINER_NAME:-}" ]]; then
            docker stop "${CONTAINER_NAME}"
          fi

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          # This job defines no matrix, so the name is built from the job id,
          # run id and run attempt instead of matrix.* values.
          name: coredumps-${{ github.job }}-${{ github.run_id }}-${{ github.run_attempt }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown XPU
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          # grep's own command line matches the pattern, so a count of 1 means
          # no other prune is currently running.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()