Skip to content

xpu-test

xpu-test #19

Workflow file for this run

# TODO: This workflow looks similar to _linux-test, but sharing it would require
# inserting an if statement in about a dozen places, so it is probably better to
# keep a separate workflow altogether.
# Workflow identity and trigger: runs only when a tag matching ciflow/xpu/* is pushed.
name: xpu-test
on:
  push:
    tags:
      - 'ciflow/xpu/*'

# Token scopes requested for the jobs in this workflow.
permissions:
  id-token: write
  contents: read

# One active run per group; a newer run cancels the one in progress.
# On refs/heads/main the group is keyed by run number, so each run there gets
# its own group; on any other ref the group is keyed by the ref itself.
concurrency:
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
jobs:
  test:
    # The guard below is commented out, so the job currently runs unconditionally.
    # Don't run on forked repos or empty test matrix
    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    runs-on: linux.idc.xpu
    timeout-minutes: 60
    env:
      # Image name handed to the "Calculate docker image" step.
      DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
      PYTORCH_RETRY_TEST_CASES: "1"
      PYTORCH_OVERRIDE_FLAKY_SIGNAL: "1"
      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
    steps:
# Shallow checkout (depth 1, no submodules) of pytorch/pytorch at the
# "nightly" ref into ./pytorch.
- name: Checkout PyTorch
  uses: actions/checkout@v4
  with:
    repository: pytorch/pytorch
    ref: nightly
    path: pytorch
    submodules: false
    fetch-depth: 1
# Shallow checkout (depth 1, recursive submodules) of the repository running
# this workflow into ./torchao. The ref is github.head_ref when it is set,
# otherwise github.ref.
- name: Checkout Torchao (ao)
  uses: actions/checkout@v4
  with:
    repository: ${{ github.repository }}
    ref: ${{ github.head_ref || github.ref }}
    path: torchao
    submodules: recursive
    fetch-depth: 1
# Remove stopped containers from the node; runs even if earlier steps failed.
- name: Clean all stopped docker containers
  if: always()
  shell: bash
  run: |
    # Prune all stopped containers.
    # If other runner is pruning on this node, will skip.
    # The grep in this pipeline normally counts itself, so a count of 1 means
    # no prune is running. If ps snapshots the process table before grep
    # shows up, the count is 0 and grep exits non-zero; "|| true" keeps that
    # from failing the step under errexit/pipefail, and "-le 1" still prunes.
    nprune=$(ps -ef | grep -c "docker container prune" || true)
    if [[ $nprune -le 1 ]]; then
      docker container prune -f
    fi
- name: Runner health check system info
  if: always()
  shell: bash
  run: |
    # Best-effort dump of the OS release and two apt source lists; a missing
    # file does not fail the step.
    for info_file in \
      /etc/os-release \
      /etc/apt/sources.list.d/oneAPI.list \
      /etc/apt/sources.list.d/intel-gpu-jammy.list; do
      cat "${info_file}" || true
    done
    # Report the account the runner executes as.
    whoami
- name: Runner health check xpu-smi
  if: always()
  shell: bash
  run: |
    # Informational only: print the discovery output, give up after 30
    # seconds, and never fail the step.
    timeout 30 xpu-smi discovery || true
- name: Runner health check GPU count
  if: always()
  shell: bash
  run: |
    # Count the "Device Name" lines printed by xpu-smi; "|| true" keeps a zero
    # count (grep exit status 1) or a timeout from aborting before the check.
    gpu_count=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true)
    if [[ "${gpu_count}" -eq 0 ]]; then
      echo "Error: Failed to detect any GPUs on the runner"
      echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
      exit 1
    fi
# Runs the diskspace-cleanup action from pytorch/pytorch@main, even if earlier
# steps failed.
- name: Runner diskspace health check
  if: always()
  uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
- name: Runner health check disconnect on failure
  if: failure()
  shell: bash
  run: |
    # Runs only when a previous step has failed. Kills runsvc.sh, which per
    # the step name takes the runner offline.
    killall runsvc.sh
- name: Preserve github env variables for use in docker
  shell: bash
  run: |
    # Append every GITHUB* variable, then every CI* variable, to a per-run file.
    # NOTE(review): no step visible in this workflow reads this file; confirm
    # whether the docker run in the Test step should consume it.
    env_file="/tmp/github_env_${GITHUB_RUN_ID}"
    {
      env | grep '^GITHUB'
      env | grep '^CI'
    } >> "${env_file}"
# Exports GPU_FLAG, the device and group options the Test step passes to
# docker run.
- name: XPU set GPU_FLAG
  shell: bash
  run: |
    # Add render group for container creation.
    # Look the group up by exact name: grepping /etc/group for "render" is a
    # substring match that can return several lines (or none), which would
    # corrupt the single-line GPU_FLAG entry written to GITHUB_ENV.
    render_gid=$(getent group render | cut -d: -f3 || true)
    if [[ -z "${render_gid}" ]]; then
      echo "Error: could not determine the gid of the 'render' group on this runner"
      exit 1
    fi
    echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add ${render_gid}" >> "${GITHUB_ENV}"
# Assumes the IAM role below in us-east-1. The action is pinned to a commit SHA.
- name: configure aws credentials
  id: aws_creds
  uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722  # v4.1.0
  with:
    aws-region: us-east-1
    role-to-assume: 'arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only'
# Logs docker in to Amazon ECR. The action is pinned to a commit SHA.
- name: Login to Amazon ECR
  id: login-ecr
  uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076  # v2.0.1
# Resolves env.DOCKER_IMAGE against the ./pytorch checkout. Later steps read
# the result from this step's "docker-image" output.
- name: Calculate docker image
  id: calculate-docker-image
  uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
  with:
    repo-name: pytorch
    working-directory: pytorch
    docker-image-name: ${{ env.DOCKER_IMAGE }}
- name: Use following to pull public copy of the image
  id: print-ghcr-mirror
  shell: bash
  env:
    ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
  run: |
    # Everything after the last ':' of the image reference is taken as the tag.
    image_tag="${ECR_DOCKER_IMAGE##*:}"
    # Print the equivalent pull command for the ghcr.io copy of the image.
    echo "docker pull ghcr.io/pytorch/ci-image:${image_tag/:/-}"
# Pulls the image reference produced by the calculate-docker-image step.
- name: Pull docker image
  uses: pytorch/test-infra/.github/actions/pull-docker-image@main
  with:
    docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Runner health check GPU count
  if: always()
  shell: bash
  run: |
    # Count the "Device" lines printed by clinfo -l; "|| true" keeps a zero
    # count (grep exit status 1) or a timeout from aborting before the check.
    gpu_count=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
    if [[ "${gpu_count}" -eq 0 ]]; then
      echo "Error: Failed to detect any GPUs on the runner"
      echo "Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
      exit 1
    fi
# Starts a detached container from the calculated image and runs TEST_COMMAND
# inside it. The container reference is exported as CONTAINER_NAME for the
# later steps.
- name: Test
  id: test
  env:
    TEST_COMMAND: torchao/.github/scripts/ci_test_xpu.sh
    DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
    PR_NUMBER: ${{ github.event.pull_request.number }}
    GITHUB_REPOSITORY: ${{ github.repository }}
    GITHUB_WORKFLOW: ${{ github.workflow }}
    GITHUB_JOB: ${{ github.job }}
    GITHUB_RUN_ID: ${{ github.run_id }}
    GITHUB_RUN_NUMBER: ${{ github.run_number }}
    GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
    SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  timeout-minutes: 60
  run: |
    set -x
    # The detached container is stopped by the "Stop container before exit"
    # step, and stopped containers are pruned by "Teardown XPU".
    #
    # Two flags were dropped from the docker run below without changing the
    # effective behaviour:
    #   --user $(id -u):$(id -g)    overridden by the later "--user jenkins"
    #   --name="${container_name}"  the variable is unset while this command
    #                               runs, so it always expanded to --name=""
    # docker run --detach prints the new container's ID, which is what
    # container_name ends up holding.
    #
    # NOTE(review): JOB_ID and BRANCH are forwarded with -e but are not set in
    # this step's env block; confirm whether they are expected from elsewhere.
    #
    # GPU_FLAG is intentionally unquoted so it splits into separate arguments.
    # shellcheck disable=SC2086,SC2090
    container_name=$(docker run \
      ${GPU_FLAG:-} \
      -e PR_NUMBER \
      -e GITHUB_ACTIONS \
      -e GITHUB_REPOSITORY \
      -e GITHUB_WORKFLOW \
      -e GITHUB_JOB \
      -e GITHUB_RUN_ID \
      -e GITHUB_RUN_NUMBER \
      -e GITHUB_RUN_ATTEMPT \
      -e JOB_ID \
      -e BRANCH \
      -e SHA1 \
      --ulimit stack=10485760:83886080 \
      --ulimit core=0 \
      --security-opt seccomp=unconfined \
      --cap-add=SYS_PTRACE \
      --shm-size="8g" \
      --tty \
      --detach \
      --user jenkins \
      --privileged \
      -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
      -w /var/lib/jenkins/workspace \
      "${DOCKER_IMAGE}"
    )
    # save container name for later step
    echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
    # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
    # NOTE(review): no copy happens in this step; presumably TEST_COMMAND does it - confirm.
    docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"
- name: Collect backtraces from coredumps (if any)
  if: always()
  run: |
    # For each core.<n> file found under the current directory, run gdb inside
    # the test container and print its backtrace.
    # shellcheck disable=SC2156
    find . -iname "core.[1-9]*" \
      -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
# Stops the container started by the Test step; runs even if earlier steps
# failed.
- name: Stop container before exit
  if: always()
  shell: bash
  run: |
    # Workaround for multiple runners on same IDC node
    # CONTAINER_NAME is exported by the Test step. When that step was skipped
    # or failed before exporting it, there is nothing to stop, and calling
    # docker stop with an empty name would only add a second, misleading error.
    if [[ -n "${CONTAINER_NAME:-}" ]]; then
      docker stop "${CONTAINER_NAME}"
    else
      echo "No container was started; nothing to stop"
    fi
# Uploads any core.<n> files as an artifact when the job has failed.
# The job defines no matrix, so the matrix.* fields previously used in the
# artifact name were all empty; the name is built from the job id, run id and
# run attempt instead.
- name: Store Core dumps on GitHub
  if: failure()
  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
  with:
    name: coredumps-${{ github.job }}-${{ github.run_id }}-${{ github.run_attempt }}
    path: ./**/core.[1-9]*
    if-no-files-found: ignore
    retention-days: 14
# Remove stopped containers from the node at the end of the job; runs even if
# earlier steps failed.
- name: Teardown XPU
  if: always()
  shell: bash
  run: |
    # Prune all stopped containers.
    # If other runner is pruning on this node, will skip.
    # The grep in this pipeline normally counts itself, so a count of 1 means
    # no prune is running. If ps snapshots the process table before grep
    # shows up, the count is 0 and grep exits non-zero; "|| true" keeps that
    # from failing the step under errexit/pipefail, and "-le 1" still prunes.
    nprune=$(ps -ef | grep -c "docker container prune" || true)
    if [[ $nprune -le 1 ]]; then
      docker container prune -f
    fi
# Final run of the diskspace-cleanup action from pytorch/pytorch@main, even if
# earlier steps failed.
- name: Runner diskspace health check
  if: always()
  uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main