xpu-test #19

Summary
Jobs
- test
Run details
- Usage
- Workflow file

Workflow file for this run

.github/workflows/xpu_test.yml at 8067bd6

	# TODO: this workflow closely resembles _linux-test, but sharing it would require
	# conditionals in roughly a dozen places, so keeping a separate workflow is
	# probably the better choice.

	name: xpu-test

	# Triggered only by pushing a tag that matches ciflow/xpu/*.
	on:
	  push:
	    tags:
	      - ciflow/xpu/*

	# Token scopes for the job: read-only repository contents, plus permission to
	# request an OIDC id-token (the AWS credentials step assumes an IAM role).
	permissions:
	  contents: read
	  id-token: write

	# One run per workflow + ref; a newer run cancels the one in progress.
	# On refs/heads/main the run number is used instead, so those runs never share a group.
	concurrency:
	  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
	  cancel-in-progress: true

	jobs:
	  test:
	    # Guard against forked repos / an empty test matrix. Kept disabled:
	    # this workflow is triggered by tag pushes and declares no `inputs`, so the
	    # test-matrix half would need rewriting before the condition is re-enabled.
	    # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
	    runs-on: linux.idc.xpu
	    timeout-minutes: 60
	    # Job-wide environment, visible to every step below.
	    env:
	      DOCKER_IMAGE: ci-image:pytorch-linux-jammy-xpu-n-py3
	      PYTORCH_RETRY_TEST_CASES: 1
	      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
	      XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
	    steps:
	# Shallow checkout of the pytorch/pytorch `nightly` ref into ./pytorch, without submodules.
	- name: Checkout PyTorch
	  uses: actions/checkout@v4
	  with:
	    repository: pytorch/pytorch
	    ref: nightly
	    path: pytorch
	    submodules: false
	    fetch-depth: 1

	# Shallow checkout of the repository running this workflow into ./torchao,
	# including submodules.
	- name: Checkout Torchao (ao)
	  uses: actions/checkout@v4
	  with:
	    repository: ${{ github.repository }}
	    # Falls back to github.ref whenever github.head_ref is empty.
	    ref: ${{ github.head_ref || github.ref }}
	    path: torchao
	    fetch-depth: 1
	    submodules: recursive

	- name: Clean all stopped docker containers
	  if: always()
	  shell: bash
	  run: |
	    # Prune all stopped containers.
	    # If another runner on this node is already pruning, skip. The count
	    # normally includes this grep process itself, so 1 means "no other prune".
	    nprune=$(ps -ef | grep -c "docker container prune")
	    if [[ $nprune -eq 1 ]]; then
	      docker container prune -f
	    fi

	# Print OS and apt source details plus the current user for diagnostics.
	# Each `cat` is best-effort: a missing file must not fail the step.
	- name: Runner health check system info
	  if: always()
	  shell: bash
	  run: |
	    cat /etc/os-release || true
	    cat /etc/apt/sources.list.d/oneAPI.list || true
	    cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
	    whoami

	# Log the xpu-smi device listing; capped at 30 seconds and never fails the step.
	- name: Runner health check xpu-smi
	  if: always()
	  shell: bash
	  run: |
	    timeout 30 xpu-smi discovery || true

	# Fail the job early when xpu-smi reports no devices on the host.
	- name: Runner health check GPU count (xpu-smi)
	  if: always()
	  shell: bash
	  run: |
	    # `|| true` keeps the step alive when grep finds nothing or xpu-smi times out;
	    # the explicit check below decides whether to fail.
	    ngpu=$(timeout 30 xpu-smi discovery | grep -c -E 'Device Name' || true)
	    msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
	    if [[ $ngpu -eq 0 ]]; then
	      echo "Error: Failed to detect any GPUs on the runner"
	      echo "$msg"
	      exit 1
	    fi

	# Runs the shared diskspace-cleanup action regardless of earlier step results.
	- name: Runner diskspace health check
	  if: always()
	  uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main

	# Only runs when an earlier step failed: kills runsvc.sh so that, per the
	# step name, the unhealthy runner disconnects.
	- name: Runner health check disconnect on failure
	  if: ${{ failure() }}
	  shell: bash
	  run: |
	    killall runsvc.sh

	# Append every GITHUB* and CI* variable to a per-run file under /tmp.
	# NOTE(review): nothing else in this workflow file reads that file — confirm it is still needed.
	- name: Preserve github env variables for use in docker
	  shell: bash
	  run: |
	    env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
	    env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

	# Export GPU_FLAG (docker device / group options) for the Test step.
	- name: XPU set GPU_FLAG
	  shell: bash
	  run: |
	    # Add render group for container creation.
	    # Anchor the match to the group-name field so that only the `render` entry
	    # is selected; a multi-line result would corrupt the GITHUB_ENV entry.
	    render_gid=$(grep '^render:' /etc/group | cut -d: -f3)
	    echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"

	# Assume the read-only S3/ECR role in us-east-1 (action pinned by commit SHA).
	- name: configure aws credentials
	  id: aws_creds
	  uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
	  with:
	    aws-region: us-east-1
	    role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only

	# Log in to Amazon ECR (action pinned by commit SHA).
	- name: Login to Amazon ECR
	  id: login-ecr
	  uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

	# Resolve the image reference for env.DOCKER_IMAGE from the ./pytorch checkout.
	# Later steps read the `docker-image` output of this step.
	- name: Calculate docker image
	  id: calculate-docker-image
	  uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
	  with:
	    repo-name: pytorch
	    working-directory: pytorch
	    docker-image-name: ${{ env.DOCKER_IMAGE }}

	# Print a `docker pull` command for the ghcr.io copy of the image (informational only).
	- name: Use following to pull public copy of the image
	  id: print-ghcr-mirror
	  env:
	    ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
	  shell: bash
	  run: |
	    # Keep only what follows the last ':' of the image reference (the tag).
	    tag=${ECR_DOCKER_IMAGE##*:}
	    echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

	# Pull the image resolved by the calculate-docker-image step.
	- name: Pull docker image
	  uses: pytorch/test-infra/.github/actions/pull-docker-image@main
	  with:
	    docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

	# Second device check, this time via `clinfo -l`; fails the job when no
	# device line is listed.
	- name: Runner health check GPU count (clinfo)
	  if: always()
	  shell: bash
	  run: |
	    # `|| true` keeps the step alive when grep finds nothing or clinfo times out;
	    # the explicit check below decides whether to fail.
	    ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
	    msg="Please file an issue on pytorch/ao reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
	    if [[ $ngpu -eq 0 ]]; then
	      echo "Error: Failed to detect any GPUs on the runner"
	      echo "$msg"
	      exit 1
	    fi

	# Start a detached container from the resolved image and run TEST_COMMAND in it.
	- name: Test
	  id: test
	  env:
	    TEST_COMMAND: torchao/.github/scripts/ci_test_xpu.sh
	    DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
	    PR_NUMBER: ${{ github.event.pull_request.number }}
	    GITHUB_REPOSITORY: ${{ github.repository }}
	    GITHUB_WORKFLOW: ${{ github.workflow }}
	    GITHUB_JOB: ${{ github.job }}
	    GITHUB_RUN_ID: ${{ github.run_id }}
	    GITHUB_RUN_NUMBER: ${{ github.run_number }}
	    GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
	    SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
	  timeout-minutes: 60
	  run: |
	    set -x

	    # The detached container is stopped by "Stop container before exit" and
	    # pruned by "Teardown XPU".
	    # No --name is passed: the previous --name="${container_name}" expanded to an
	    # empty string because the variable is only assigned by this very command.
	    # `docker run --detach` prints the container ID, which is what gets captured.
	    # A single --user is kept (jenkins); the earlier --user $(id -u):$(id -g) was
	    # overridden by it anyway.
	    # JOB_ID and BRANCH are not set in this step's env block; they are forwarded
	    # only if the runner environment defines them.
	    # GPU_FLAG is intentionally unquoted so it splits into separate options.
	    # shellcheck disable=SC2086,SC2090
	    container_name=$(docker run \
	      ${GPU_FLAG:-} \
	      -e PR_NUMBER \
	      -e GITHUB_ACTIONS \
	      -e GITHUB_REPOSITORY \
	      -e GITHUB_WORKFLOW \
	      -e GITHUB_JOB \
	      -e GITHUB_RUN_ID \
	      -e GITHUB_RUN_NUMBER \
	      -e GITHUB_RUN_ATTEMPT \
	      -e JOB_ID \
	      -e BRANCH \
	      -e SHA1 \
	      --ulimit stack=10485760:83886080 \
	      --ulimit core=0 \
	      --security-opt seccomp=unconfined \
	      --cap-add=SYS_PTRACE \
	      --shm-size="8g" \
	      --tty \
	      --detach \
	      --user jenkins \
	      --privileged \
	      -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
	      -w /var/lib/jenkins/workspace \
	      "${DOCKER_IMAGE}"
	    )
	    # save container name for later step
	    echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
	    # NOTE(review): the jenkins user reportedly cannot write to the mounted
	    # workspace; any copy-to-home workaround has to live inside TEST_COMMAND,
	    # since nothing is copied here.
	    docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

	# For every core.<n> file under the workspace, print a gdb backtrace from
	# inside the test container.
	- name: Collect backtraces from coredumps (if any)
	  if: always()
	  run: |
	    # shellcheck disable=SC2156
	    find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

	- name: Stop container before exit
	  if: always()
	  run: |
	    # Workaround for multiple runners on same IDC node
	    # CONTAINER_NAME is exported by the Test step. This step runs even when
	    # the job failed before that point, so skip the stop when there is
	    # nothing to stop instead of failing on `docker stop ""`.
	    if [ -n "${CONTAINER_NAME:-}" ]; then
	      docker stop "${CONTAINER_NAME}"
	    fi

	# Upload core dumps as an artifact when the job failed.
	- name: Store Core dumps on GitHub
	  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
	  if: failure()
	  with:
	    # This job defines no matrix, so the name is built from the job id and
	    # run identifiers rather than from (empty) matrix.* values.
	    name: coredumps-${{ github.job }}-${{ github.run_id }}-${{ github.run_attempt }}
	    retention-days: 14
	    if-no-files-found: ignore
	    # Same pattern as the backtrace step: any depth, any numeric suffix length.
	    path: ./**/core.[1-9]*

	- name: Teardown XPU
	  if: always()
	  shell: bash
	  run: |
	    # Prune all stopped containers.
	    # If another runner on this node is already pruning, skip. The count
	    # normally includes this grep process itself, so 1 means "no other prune".
	    nprune=$(ps -ef | grep -c "docker container prune")
	    if [[ $nprune -eq 1 ]]; then
	      docker container prune -f
	    fi

	# Final run of the shared diskspace-cleanup action, regardless of job outcome.
	- name: Runner diskspace health check
	  if: always()
	  uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

xpu-test #19

Workflow file

xpu-test #19

Uh oh!

Jobs

Run details

Workflow file for this run