Skip to content

Commit c6308a9

Browse files
Metal backend: Add Whisper to CI workflow (pytorch#15685)
This PR refactors the model export and e2e scripts to support both CUDA and Metal backends, and updates the Metal CI workflow to generalize model export and e2e testing for multiple models and quantization options. It expands Metal CI model coverage to also include Whisper.
1 parent aba44fd commit c6308a9

File tree

4 files changed

+114
-128
lines changed

4 files changed

+114
-128
lines changed

.ci/scripts/export_model_cuda_artifact.sh renamed to .ci/scripts/export_model_artifact.sh

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8-
# Export model to CUDA format with optional quantization
8+
# Export model to CUDA/Metal format with optional quantization
99

1010
show_help() {
1111
cat << EOF
12-
Usage: export_model_cuda_artifact.sh <hf_model> [quant_name] [output_dir]
12+
Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir]
1313
14-
Export a HuggingFace model to CUDA format with optional quantization.
14+
Export a HuggingFace model to CUDA/Metal format with optional quantization.
1515
1616
Arguments:
17+
device cuda or metal (required)
18+
1719
hf_model HuggingFace model ID (required)
1820
Supported models:
1921
- mistralai/Voxtral-Mini-3B-2507
@@ -29,9 +31,9 @@ Arguments:
2931
output_dir Output directory for artifacts (optional, default: current directory)
3032
3133
Examples:
32-
export_model_cuda_artifact.sh "openai/whisper-small"
33-
export_model_cuda_artifact.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
34-
export_model_cuda_artifact.sh "google/gemma-3-4b-it" "non-quantized" "./output"
34+
export_model_artifact.sh metal "openai/whisper-small"
35+
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
36+
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
3537
EOF
3638
}
3739

@@ -48,9 +50,22 @@ fi
4850

4951
set -eux
5052

51-
HF_MODEL="$1"
52-
QUANT_NAME="${2:-non-quantized}"
53-
OUTPUT_DIR="${3:-.}"
53+
DEVICE="$1"
54+
HF_MODEL="$2"
55+
QUANT_NAME="${3:-non-quantized}"
56+
OUTPUT_DIR="${4:-.}"
57+
58+
case "$DEVICE" in
59+
cuda)
60+
;;
61+
metal)
62+
;;
63+
*)
64+
echo "Error: Unsupported device '$DEVICE'"
65+
echo "Supported devices: cuda, metal"
66+
exit 1
67+
;;
68+
esac
5469

5570
# Determine model configuration based on HF model ID
5671
case "$HF_MODEL" in
@@ -75,6 +90,10 @@ case "$HF_MODEL" in
7590
fi
7691
;;
7792
google/gemma-3-4b-it)
93+
if [ "$DEVICE" = "metal" ]; then
94+
echo "Error: Export for device 'metal' is not yet tested for model '$HF_MODEL'"
95+
exit 1
96+
fi
7897
MODEL_NAME="gemma3"
7998
TASK="multimodal-text-to-text"
8099
MAX_SEQ_LEN="64"
@@ -95,9 +114,17 @@ case "$QUANT_NAME" in
95114
EXTRA_ARGS=""
96115
;;
97116
quantized-int4-tile-packed)
117+
if [ "$DEVICE" = "metal" ]; then
118+
echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'"
119+
exit 1
120+
fi
98121
EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
99122
;;
100123
quantized-int4-weight-only)
124+
if [ "$DEVICE" = "metal" ]; then
125+
echo "Error: Metal backend does not yet support quantization '$QUANT_NAME'"
126+
exit 1
127+
fi
101128
EXTRA_ARGS="--qlinear_encoder 4w"
102129
;;
103130
*)
@@ -118,12 +145,18 @@ MAX_SEQ_LEN_ARG=""
118145
if [ -n "$MAX_SEQ_LEN" ]; then
119146
MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"
120147
fi
148+
149+
DEVICE_ARG=""
150+
if [ "$DEVICE" = "cuda" ]; then
151+
DEVICE_ARG="--device cuda"
152+
fi
153+
121154
optimum-cli export executorch \
122155
--model "$HF_MODEL" \
123156
--task "$TASK" \
124-
--recipe "cuda" \
157+
--recipe "$DEVICE" \
125158
--dtype bfloat16 \
126-
--device cuda \
159+
${DEVICE_ARG} \
127160
${MAX_SEQ_LEN_ARG} \
128161
${EXTRA_ARGS} \
129162
--output_dir ./
@@ -137,18 +170,18 @@ if [ -n "$PREPROCESSOR_OUTPUT" ]; then
137170
fi
138171

139172
test -f model.pte
140-
test -f aoti_cuda_blob.ptd
173+
test -f aoti_${DEVICE}_blob.ptd
141174
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
142175
test -f $PREPROCESSOR_OUTPUT
143176
fi
144177
echo "::endgroup::"
145178

146179
echo "::group::Store $MODEL_NAME Artifacts"
147180
mkdir -p "${OUTPUT_DIR}"
148-
cp model.pte "${OUTPUT_DIR}/"
149-
cp aoti_cuda_blob.ptd "${OUTPUT_DIR}/"
181+
mv model.pte "${OUTPUT_DIR}/"
182+
mv aoti_${DEVICE}_blob.ptd "${OUTPUT_DIR}/"
150183
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
151-
cp $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/"
184+
mv $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/"
152185
fi
153186
ls -al "${OUTPUT_DIR}"
154187
echo "::endgroup::"

.ci/scripts/test_model_cuda_e2e.sh renamed to .ci/scripts/test_model_e2e.sh

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8-
# Test CUDA model end-to-end, need to run .ci/scripts/export_model_cuda_artifact.sh first
8+
# Test CUDA/Metal model end-to-end, need to run .ci/scripts/export_model_artifact.sh first
99

1010
show_help() {
1111
cat << EOF
12-
Usage: test_model_cuda_e2e.sh <hf_model> <quant_name> [model_dir]
12+
Usage: test_model_e2e.sh <device> <hf_model> <quant_name> [model_dir]
1313
14-
Build and run end-to-end tests for CUDA models.
14+
Build and run end-to-end tests for CUDA/Metal models.
1515
1616
Arguments:
17+
device cuda or metal (required)
18+
1719
hf_model HuggingFace model ID (required)
1820
Supported models:
1921
- mistralai/Voxtral-Mini-3B-2507
@@ -27,12 +29,12 @@ Arguments:
2729
- quantized-int4-weight-only
2830
2931
model_dir Directory containing model artifacts (optional, default: current directory)
30-
Expected files: model.pte, aoti_cuda_blob.ptd
32+
Expected files: model.pte, aoti_cuda_blob.ptd/aoti_metal_blob.ptd
3133
Tokenizers and test files will be downloaded to this directory
3234
3335
Examples:
34-
test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized"
35-
test_model_cuda_e2e.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
36+
test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
37+
test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
3638
EOF
3739
}
3840

@@ -55,20 +57,21 @@ fi
5557

5658
set -eux
5759

58-
HF_MODEL="$1"
59-
QUANT_NAME="$2"
60+
DEVICE="$1"
61+
HF_MODEL="$2"
62+
QUANT_NAME="$3"
6063
# Download tokenizers, audio, and image files to this directory
61-
MODEL_DIR="${3:-.}"
64+
MODEL_DIR="${4:-.}"
6265

6366
echo "Testing model: $HF_MODEL (quantization: $QUANT_NAME)"
6467

65-
# Make sure model.pte and aoti_cuda_blob.ptd exist
68+
# Make sure model.pte and aoti_${DEVICE}_blob.ptd exist
6669
if [ ! -f "$MODEL_DIR/model.pte" ]; then
6770
echo "Error: model.pte not found in $MODEL_DIR"
6871
exit 1
6972
fi
70-
if [ ! -f "$MODEL_DIR/aoti_cuda_blob.ptd" ]; then
71-
echo "Error: aoti_cuda_blob.ptd not found in $MODEL_DIR"
73+
if [ ! -f "$MODEL_DIR/aoti_${DEVICE}_blob.ptd" ]; then
74+
echo "Error: aoti_${DEVICE}_blob.ptd not found in $MODEL_DIR"
7275
exit 1
7376
fi
7477
# Locate EXECUTORCH_ROOT from the directory of this script
@@ -152,14 +155,24 @@ ls -al
152155
echo "::endgroup::"
153156

154157
echo "::group::Build $MODEL_NAME Runner"
158+
159+
if [ "$DEVICE" = "cuda" ]; then
160+
BUILD_BACKEND="EXECUTORCH_BUILD_CUDA"
161+
elif [ "$DEVICE" = "metal" ]; then
162+
BUILD_BACKEND="EXECUTORCH_BUILD_METAL"
163+
else
164+
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda' or 'metal'."
165+
exit 1
166+
fi
167+
155168
cmake --preset llm \
156-
-DEXECUTORCH_BUILD_CUDA=ON \
169+
-D${BUILD_BACKEND}=ON \
157170
-DCMAKE_INSTALL_PREFIX=cmake-out \
158171
-DCMAKE_BUILD_TYPE=Release \
159172
-Bcmake-out -S.
160173
cmake --build cmake-out -j$(nproc) --target install --config Release
161174

162-
cmake -DEXECUTORCH_BUILD_CUDA=ON \
175+
cmake -D${BUILD_BACKEND}=ON \
163176
-DCMAKE_BUILD_TYPE=Release \
164177
-Sexamples/models/$RUNNER_PATH \
165178
-Bcmake-out/examples/models/$RUNNER_PATH/
@@ -168,11 +181,13 @@ echo "::endgroup::"
168181

169182
echo "::group::Run $MODEL_NAME Runner"
170183
set +e
171-
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
184+
if [ "$DEVICE" = "cuda" ]; then
185+
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
186+
fi
172187

173188
# Build runner command with common arguments
174189
RUNNER_BIN="cmake-out/examples/models/$RUNNER_PATH/$RUNNER_TARGET"
175-
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd --temperature 0"
190+
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_${DEVICE}_blob.ptd --temperature 0"
176191

177192
# Add model-specific arguments
178193
case "$MODEL_NAME" in

.github/workflows/cuda.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ jobs:
142142
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
143143
echo "::endgroup::"
144144
145-
source .ci/scripts/export_model_cuda_artifact.sh "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
145+
source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
146146
147147
benchmark-model-cuda:
148148
name: benchmark-model-cuda
@@ -249,4 +249,4 @@ jobs:
249249
download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
250250
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
251251
script: |
252-
source .ci/scripts/test_model_cuda_e2e.sh "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
252+
source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"

0 commit comments

Comments (0)