-
Notifications
You must be signed in to change notification settings - Fork 530
Test torchprime from PyTorch/XLA #9152
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
2a752e2
3a8fdd1
c4d7f56
44f8e43
ad34f5b
96c6541
d607aac
8934753
6c46342
6f84690
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
name: torchprime E2E tests | ||
on: | ||
workflow_call: | ||
inputs: | ||
timeout-minutes: | ||
required: false | ||
type: number | ||
description: Timeout in minutes for the job run | ||
default: 120 | ||
has_code_changes: | ||
required: false | ||
type: string | ||
description: Whether to run full workflow or not | ||
default: 'true' | ||
secrets: | ||
# This is a token for a GitHub user with access to the torchprime repo. | ||
# It is used to trigger the torchprime E2E test workflow. | ||
# The token should be managed in the "Settings > Secrets and variables > Actions" | ||
# section of the repo. | ||
PERSONAL_ACCESS_TOKEN_FOR_TRIGGERING_TORCHPRIME: | ||
required: true | ||
GCLOUD_SERVICE_KEY: | ||
required: true | ||
jobs: | ||
torchprime-e2e-test: | ||
name: Run torchprime E2E tests | ||
timeout-minutes: ${{ inputs.timeout-minutes }} | ||
runs-on: ubuntu-22.04 | ||
steps: | ||
- name: Use Docker in rootless mode | ||
if: inputs.has_code_changes == 'true' | ||
uses: ScribeMD/[email protected] | ||
- name: Add user to docker group | ||
if: inputs.has_code_changes == 'true' | ||
run: | | ||
sudo usermod -aG docker $USER | ||
newgrp docker | ||
shell: bash | ||
# Googlers: if this fails, follow http://shortn/_61iSj31q1b to debug. | ||
- uses: google-github-actions/auth@v2 | ||
if: inputs.has_code_changes == 'true' | ||
with: | ||
credentials_json: '${{ secrets.GCLOUD_SERVICE_KEY }}' | ||
- uses: google-github-actions/setup-gcloud@v2 | ||
if: inputs.has_code_changes == 'true' | ||
with: | ||
version: '>= 363.0.0' | ||
install_components: 'beta,gke-gcloud-auth-plugin' | ||
- name: Verify GCP setup | ||
if: inputs.has_code_changes == 'true' | ||
run: gcloud info | ||
shell: bash | ||
- name: Authenticate Docker | ||
if: inputs.has_code_changes == 'true' | ||
run: gcloud auth configure-docker --quiet | ||
shell: bash | ||
- name: Activate SA credentials | ||
if: inputs.has_code_changes == 'true' | ||
run: gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS | ||
shell: bash | ||
- name: Checkout infra | ||
if: inputs.has_code_changes == 'true' | ||
uses: actions/checkout@v4 | ||
with: | ||
sparse-checkout: | | ||
infra | ||
fetch-depth: 1 | ||
path: pytorch-xla | ||
# Build a docker image for torchprime E2E test | ||
# First download the torch-xla-wheels | ||
- name: Fetch wheels | ||
if: inputs.has_code_changes == 'true' | ||
uses: actions/download-artifact@v4 | ||
with: | ||
name: torch-xla-wheels | ||
path: /tmp/wheels/ | ||
# Generate a random 16-character hex string (8 random bytes) to use as a unique docker tag | ||
- name: Generate random UUID tag | ||
if: inputs.has_code_changes == 'true' | ||
id: random_tag | ||
shell: bash | ||
run: | | ||
echo "uuid=$(openssl rand -hex 8)" >> $GITHUB_OUTPUT | ||
# Then build a docker image that installs the wheels, and push it | ||
- name: Build and push docker image | ||
if: inputs.has_code_changes == 'true' | ||
shell: bash | ||
working-directory: pytorch-xla | ||
run: | | ||
. ./infra/ansible/build_for_torchprime.sh | ||
env: | ||
DEFAULT_CONTEXT_PATH: /tmp/wheels | ||
DOCKER_IMAGE_NAME: for-torchprime-ci | ||
DOCKER_IMAGE_TAG: ${{ steps.random_tag.outputs.uuid }} | ||
DOCKER_PROJECT: tpu-pytorch | ||
# Trigger torchprime E2E test workflow | ||
- uses: convictional/[email protected] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we document how to debug this workflow and get help if needed? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Who provisions for this workflow? Does it have enough capacity for our needs? |
||
if: inputs.has_code_changes == 'true' | ||
with: | ||
owner: AI-Hypercomputer | ||
repo: torchprime | ||
github_token: ${{ secrets.PERSONAL_ACCESS_TOKEN_FOR_TRIGGERING_TORCHPRIME }} | ||
workflow_file_name: e2e_test.yml | ||
wait_interval: 60 | ||
# TODO(yifeit): change this back to `main` when https://github.com/AI-Hypercomputer/torchprime/tree/yifeit/torchprime-ci is merged. | ||
ref: yifeit/torchprime-ci | ||
client_payload: '{"docker_url": "gcr.io/tpu-pytorch/for-torchprime-ci:${{ steps.random_tag.outputs.uuid }}"}' |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# syntax=docker/dockerfile:1.4 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Document the purpose of this dockerfile (e.g. make it clear this is just for torchprime testing, not used by torchprime itself in its normal usage)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Consider renaming this to |
||
# Image for running torchprime E2E tests against freshly built wheels. | ||
# NOTE(review): presumably this is the file that infra/ansible/build_for_torchprime.sh passes to `docker build -f` — confirm. | ||
# Python version and Debian release of the base image; both have defaults and can be overridden with --build-arg. | ||
ARG python_version=3.10 | ||
ARG debian_version=bullseye | ||
|
||
FROM python:${python_version}-${debian_version} AS release | ||
|
||
# Stage every *.whl found at the top level of the build context under /tmp/wheels. | ||
# In the torchprime E2E workflow the build context is /tmp/wheels, which is populated from the `torch-xla-wheels` artifact. | ||
WORKDIR /tmp/wheels | ||
COPY ./*.whl ./ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Document what these wheels are? |
||
|
||
# List the staged wheels in the build log, then install all of them with pip. | ||
RUN echo "Installing the following wheels" && ls *.whl | ||
RUN pip install *.whl | ||
|
||
# Install the dependencies including libtpu. | ||
WORKDIR /ansible | ||
RUN pip install ansible | ||
# `ansible` is a named build context (supplied with `docker build --build-context ansible=...`), not a build stage in this file. | ||
COPY --from=ansible . /ansible | ||
|
||
# ansible_vars is handed to ansible-playbook as extra vars (-e); only tasks tagged `install_deps` are run. | ||
ARG ansible_vars | ||
RUN ansible-playbook -vvv playbook.yaml -e "stage=release" -e "${ansible_vars}" --tags "install_deps" | ||
|
||
WORKDIR / | ||
|
||
# Remove the staging directories from the final filesystem. | ||
# NOTE(review): this runs as a separate RUN layer, so it presumably does not reduce the image size — confirm. | ||
RUN rm -rf /ansible /tmp/wheels |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/bin/bash | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Consider renaming to |
||
|
||
# This script builds and pushes a docker image to be used for torchprime E2E tests. | ||
# | ||
# torchprime is a reference implementation of models using PyTorch/XLA: | ||
# https://github.com/AI-Hypercomputer/torchprime. | ||
# | ||
# The purpose of building a docker image here is to trigger torchprime E2E tests | ||
# from PyTorch/XLA PRs and post-submits. The reason for running torchprime tests | ||
# on PyTorch/XLA changes is to ensure that torchprime models are not broken. | ||
# See https://github.com/AI-Hypercomputer/torchprime/issues/161 for the detailed | ||
# motivation. | ||
|
||
# Abort on the first failing command (-e) and trace every command (-x) so the
# CI log shows exactly what was executed.
set -ex

# Required environment variables:
#   DEFAULT_CONTEXT_PATH  docker build context (directory holding the wheels)
#   DOCKER_IMAGE_NAME     image name, without registry or tag
#   DOCKER_IMAGE_TAG      image tag
#   DOCKER_PROJECT        project segment of the gcr.io image path
#
# Fail fast, with the diagnostic on stderr, if any of them is unset or empty.
for required_var in \
  DEFAULT_CONTEXT_PATH \
  DOCKER_IMAGE_NAME \
  DOCKER_IMAGE_TAG \
  DOCKER_PROJECT; do
  # ${!required_var} is bash indirect expansion: the value of the variable
  # whose name is stored in required_var.
  if [[ -z "${!required_var}" ]]; then
    echo "ERROR: ${required_var} is not set" >&2
    exit 1
  fi
done
# Do not leak the loop variable into the calling shell when this is sourced.
unset required_var

export IMAGE_NAME="gcr.io/${DOCKER_PROJECT}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}"
export DOCKERFILE_PATH="infra/ansible/build_for_torchprime.Dockerfile"

echo "Building and pushing image: ${IMAGE_NAME}"

# Define ansible vars used in the docker file by `ansible-playbook`.
#
# See `infra/ansible/playbook.yaml` and `infra/ansible/config/vars.yaml`
# for definition of the variables.
#
# `read -d ''` reads up to a NUL byte, never finds one, and therefore returns 1
# at end of input even though the variable was filled. Exit status 1 is
# accepted; anything else still aborts under `set -e`. The quoted delimiter
# keeps the JSON literal (no parameter or command expansion).
read -r -d '' ANSIBLE_VARS_JSON << 'EOM' || { exit_code=$?; [[ $exit_code -eq 1 ]]; }
{
  "arch": "amd64",
  "accelerator": "tpu",
  "bundle_libtpu": "0",
  "git_versioned_xla_build": true,
  "nightly_release": true
}
EOM
# Collapse the JSON onto one line with no spaces so it can be passed as a
# single --build-arg value. None of the values above contain spaces.
ANSIBLE_VARS_COMPACT=$(printf '%s' "${ANSIBLE_VARS_JSON}" | tr -d '\n ')

# Build with the wheels directory as the main context and the ansible
# directory as an additional named context, then push the result.
docker build \
  -t "${IMAGE_NAME}" \
  -f "${DOCKERFILE_PATH}" \
  --build-context ansible=infra/ansible \
  --build-arg ansible_vars="${ANSIBLE_VARS_COMPACT}" \
  --build-arg python_version=3.10 \
  --build-arg debian_version=bullseye \
  "${DEFAULT_CONTEXT_PATH}"
docker push "${IMAGE_NAME}"

echo "Successfully pushed image: ${IMAGE_NAME}"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: use a go/ link?