Test torchprime from PyTorch/XLA #9152

Open
wants to merge 10 commits into base: master
107 changes: 107 additions & 0 deletions .github/workflows/_torchprime_ci.yml
@@ -0,0 +1,107 @@
name: torchprime E2E tests
on:
workflow_call:
inputs:
timeout-minutes:
required: false
type: number
description: Timeout in minutes for the job run
default: 120
has_code_changes:
required: false
type: string
description: Whether to run the full workflow or skip it
default: 'true'
secrets:
# This is a token for a GitHub user with access to the torchprime repo.
# It is used to trigger the torchprime E2E test workflow.
# The token should be managed in the "Settings > Secrets and variables > Actions"
# section of the repo.
PERSONAL_ACCESS_TOKEN_FOR_TRIGGERING_TORCHPRIME:
required: true
GCLOUD_SERVICE_KEY:
required: true
jobs:
torchprime-e2e-test:
name: Run torchprime E2E tests
timeout-minutes: ${{ inputs.timeout-minutes }}
runs-on: ubuntu-22.04
steps:
- name: Use Docker in rootless mode
if: inputs.has_code_changes == 'true'
uses: ScribeMD/[email protected]
- name: Add user to docker group
if: inputs.has_code_changes == 'true'
run: |
sudo usermod -aG docker $USER
newgrp docker
shell: bash
# Googlers: if this fails, follow http://shortn/_61iSj31q1b to debug.
Review comment (Collaborator): Nit: use a go/ link?

- uses: google-github-actions/auth@v2
if: inputs.has_code_changes == 'true'
with:
credentials_json: '${{ secrets.GCLOUD_SERVICE_KEY }}'
- uses: google-github-actions/setup-gcloud@v2
if: inputs.has_code_changes == 'true'
with:
version: '>= 363.0.0'
install_components: 'beta,gke-gcloud-auth-plugin'
- name: Verify GCP setup
if: inputs.has_code_changes == 'true'
run: gcloud info
shell: bash
- name: Authenticate Docker
if: inputs.has_code_changes == 'true'
run: gcloud auth configure-docker --quiet
shell: bash
- name: Activate SA credentials
if: inputs.has_code_changes == 'true'
run: gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS
shell: bash
- name: Checkout infra
if: inputs.has_code_changes == 'true'
uses: actions/checkout@v4
with:
sparse-checkout: |
infra
fetch-depth: 1
path: pytorch-xla
# Build a docker image for the torchprime E2E test.
# First, download the torch-xla-wheels artifact.
- name: Fetch wheels
if: inputs.has_code_changes == 'true'
uses: actions/download-artifact@v4
with:
name: torch-xla-wheels
path: /tmp/wheels/
# Generate a random 16-character hex string for the docker tag
- name: Generate random UUID tag
if: inputs.has_code_changes == 'true'
id: random_tag
shell: bash
run: |
echo "uuid=$(openssl rand -hex 8)" >> $GITHUB_OUTPUT
# Then build a docker image that installs the wheels and push it to GCR
- name: Build and push docker image
if: inputs.has_code_changes == 'true'
shell: bash
working-directory: pytorch-xla
run: |
. ./infra/ansible/build_for_torchprime.sh
env:
DEFAULT_CONTEXT_PATH: /tmp/wheels
DOCKER_IMAGE_NAME: for-torchprime-ci
DOCKER_IMAGE_TAG: ${{ steps.random_tag.outputs.uuid }}
DOCKER_PROJECT: tpu-pytorch
# Trigger torchprime E2E test workflow
- uses: convictional/[email protected]
Review comment (Collaborator): Can we document how to debug this workflow and get help if needed?
Review comment (Collaborator): Who provisions for this workflow? Does it have enough capacity for our needs?

if: inputs.has_code_changes == 'true'
with:
owner: AI-Hypercomputer
repo: torchprime
github_token: ${{ secrets.PERSONAL_ACCESS_TOKEN_FOR_TRIGGERING_TORCHPRIME }}
workflow_file_name: e2e_test.yml
wait_interval: 60
# TODO(yifeit): change this back to `main` when https://github.com/AI-Hypercomputer/torchprime/tree/yifeit/torchprime-ci is merged.
ref: yifeit/torchprime-ci
client_payload: '{"docker_url": "gcr.io/tpu-pytorch/for-torchprime-ci:${{ steps.random_tag.outputs.uuid }}"}'
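
For local debugging, the image named in client_payload can be pulled and smoke-tested directly. A minimal sketch in bash, where the tag is a placeholder and the import check is my assumption about the image contents, not something this workflow verifies:

  # Pull the CI image the payload points at (tag shown is illustrative).
  docker pull gcr.io/tpu-pytorch/for-torchprime-ci:0123456789abcdef
  # Assumed smoke test: import the freshly built torch_xla wheel baked into the image.
  docker run --rm gcr.io/tpu-pytorch/for-torchprime-ci:0123456789abcdef \
    python -c "import torch_xla; print(torch_xla.__version__)"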
4 changes: 4 additions & 0 deletions .github/workflows/_tpu_ci.yml
@@ -2,6 +2,10 @@ name: TPU Integration Test
on:
workflow_call:
inputs:
torch-commit:
required: false
type: string
description: PyTorch commit to test against
timeout-minutes:
required: false
type: number
10 changes: 10 additions & 0 deletions .github/workflows/build_and_test.yml
@@ -125,6 +125,16 @@ jobs:
has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
if: github.event_name == 'push' || github.event_name == 'pull_request'

test-torchprime:
name: "torchprime tests"
uses: ./.github/workflows/_torchprime_ci.yml
needs: [build-torch-xla, check_code_changes]
with:
timeout-minutes: 100
has_code_changes: ${{ needs.check_code_changes.outputs.has_code_changes }}
if: github.event_name == 'push' || github.event_name == 'pull_request'
secrets: inherit

push-docs:
name: "Build docs"
uses: ./.github/workflows/_docs.yml
23 changes: 23 additions & 0 deletions infra/ansible/build_for_torchprime.Dockerfile
@@ -0,0 +1,23 @@
# syntax=docker/dockerfile:1.4
Review comment (Collaborator): Document the purpose of this dockerfile (e.g. make it clear this is just for torchprime testing, not used by torchprime itself in its normal usage)?
Review comment (Collaborator): Consider renaming this to torchprime_e2e_test.Dockerfile to make the purpose clear?

ARG python_version=3.10
ARG debian_version=bullseye

FROM python:${python_version}-${debian_version} AS release

WORKDIR /tmp/wheels
COPY ./*.whl ./
Review comment (Collaborator): Document what these wheels are?


RUN echo "Installing the following wheels" && ls *.whl
RUN pip install *.whl

# Install the dependencies including libtpu.
WORKDIR /ansible
RUN pip install ansible
COPY --from=ansible . /ansible

ARG ansible_vars
RUN ansible-playbook -vvv playbook.yaml -e "stage=release" -e "${ansible_vars}" --tags "install_deps"

WORKDIR /

RUN rm -rf /ansible /tmp/wheels
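
For reference, a minimal sketch of building this Dockerfile by hand from the PyTorch/XLA repo root, assuming the wheels have already been placed in /tmp/wheels; note how `--build-context ansible=infra/ansible` supplies the source for the `COPY --from=ansible` step above:

  docker build -t local/for-torchprime-ci:dev \
    -f infra/ansible/build_for_torchprime.Dockerfile \
    --build-context ansible=infra/ansible \
    --build-arg ansible_vars='{"arch":"amd64","accelerator":"tpu","bundle_libtpu":"0","git_versioned_xla_build":true,"nightly_release":true}' \
    /tmp/wheels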
63 changes: 63 additions & 0 deletions infra/ansible/build_for_torchprime.sh
@@ -0,0 +1,63 @@
#!/bin/bash
Review comment (Collaborator): Consider renaming to publish_torchprime_e2e_test_docker.sh for clarity?


# This script builds and pushes a docker image to be used for torchprime E2E tests.
#
# torchprime is a reference implementation of models using PyTorch/XLA:
# https://github.com/AI-Hypercomputer/torchprime.
#
# The purpose of building a docker image here is to trigger torchprime E2E tests
# from PyTorch/XLA PRs and post-submits. The reason for running torchprime tests
# on PyTorch/XLA changes is to ensure that torchprime models are not broken.
# See https://github.com/AI-Hypercomputer/torchprime/issues/161 for the detailed
# motivation.

set -ex

# Check required environment variables
if [ -z "${DEFAULT_CONTEXT_PATH}" ]; then
echo "ERROR: DEFAULT_CONTEXT_PATH is not set"
exit 1
fi
if [ -z "${DOCKER_IMAGE_NAME}" ]; then
echo "ERROR: DOCKER_IMAGE_NAME is not set"
exit 1
fi
if [ -z "${DOCKER_IMAGE_TAG}" ]; then
echo "ERROR: DOCKER_IMAGE_TAG is not set"
exit 1
fi
if [ -z "${DOCKER_PROJECT}" ]; then
echo "ERROR: DOCKER_PROJECT is not set"
exit 1
fi

export IMAGE_NAME="gcr.io/${DOCKER_PROJECT}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}"
export DOCKERFILE_PATH="infra/ansible/build_for_torchprime.Dockerfile"

echo "Building and pushing image: ${IMAGE_NAME}"

# Define the Ansible vars consumed by `ansible-playbook` in the Dockerfile.
#
# See `infra/ansible/playbook.yaml` and `infra/ansible/config/vars.yaml`
# for definition of the variables.
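# Note: with `-d ''`, `read` exits with status 1 when it reaches EOF without
# finding a NUL delimiter, so the guard below accepts exit code 1 to keep
# `set -e` from aborting.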
read -r -d '' ANSIBLE_VARS_JSON << EOM || { exit_code=$?; [[ $exit_code -eq 1 ]]; }
{
"arch": "amd64",
"accelerator": "tpu",
"bundle_libtpu": "0",
"git_versioned_xla_build": true,
"nightly_release": true
}
EOM
ANSIBLE_VARS_COMPACT=$(echo "$ANSIBLE_VARS_JSON" | tr -d '\n' | tr -d ' ')

docker build -t "${IMAGE_NAME}" \
--build-context ansible=infra/ansible \
"${DEFAULT_CONTEXT_PATH}" \
-f "${DOCKERFILE_PATH}" \
--build-arg ansible_vars="${ANSIBLE_VARS_COMPACT}" \
--build-arg python_version=3.10 \
--build-arg debian_version=bullseye
docker push "${IMAGE_NAME}"

echo "Successfully pushed image: ${IMAGE_NAME}"