
Commit 530ddb2

Merge branch 'main' into mergennachin/fla_linear_attention

2 parents 21c5dd7 + cc27e6b

File tree: 188 files changed, +6281 −4068 lines


`.ci/scripts/unittest-linux-cmake.sh`

Lines changed: 2 additions & 0 deletions

```diff
@@ -19,6 +19,8 @@ if ! python -c "import tosa_serializer" >/dev/null 2>&1; then
   TOSA_SERIALIZATION_DIR="${TOSA_TOOLS_DIR}/serialization"
 fi
 
+# NOTE: Will be removed when tosa-tools is installed via pypi
+python -m pip install pybind11==2.10.4
 CMAKE_POLICY_VERSION_MINIMUM=3.5 BUILD_PYBIND=1 \
   python -m pip install --no-dependencies \
   "${TOSA_SERIALIZATION_DIR}"
```

`.claude/skills/building/SKILL.md`

Lines changed: 211 additions & 11 deletions

````diff
@@ -1,23 +1,223 @@
 ---
 name: building
-description: Build ExecuTorch runners or C++ libraries. Use when compiling runners for Llama, Whisper, or other models, or building the C++ runtime.
+description: Build ExecuTorch from source — Python package, C++ runtime, runners, cross-compilation, and backend-specific builds. Use when compiling anything in the ExecuTorch repo, diagnosing build failures, or setting up platform-specific builds.
 ---
 
-# Building
+# Building ExecuTorch
 
-## Runners (Makefile)
+## Step 1: Ensure Python environment (detect and fix automatically)
+
+**Path A — conda (preferred):**
+```bash
+# Initialize conda for non-interactive shells (required in Claude Code / CI)
+eval "$(conda shell.bash hook 2>/dev/null)"
+
+# Check if executorch conda env exists; create if not
+conda env list 2>/dev/null | grep executorch || \
+  ls "$(conda info --base 2>/dev/null)/envs/" 2>/dev/null | grep executorch || \
+  conda create -yn executorch python=3.12
+
+# Activate
+conda activate executorch
+```
+
+**Path B — no conda (fall back to venv):**
+```bash
+# Find a compatible Python (3.10–3.13). On macOS with only Homebrew Python 3.14+,
+# install a compatible version first: brew install python@3.12
+python3.12 -m venv .executorch-venv  # or python3.11, python3.10, python3.13
+source .executorch-venv/bin/activate
+pip install --upgrade pip
+```
+
+**Then verify (either path):**
+
+Run `python --version` and `cmake --version`. Fix automatically:
+- **Python not 3.10–3.13**: recreate the env with a correct Python version.
+- **cmake missing or < 3.24**: run `pip install 'cmake>=3.24'` inside the env.
+- **cmake >= 4.0**: works in practice, no action needed.
+
+Parallel jobs: `$(sysctl -n hw.ncpu)` on macOS, `$(nproc)` on Linux.
+
+## Step 2: Build
+
+Route based on what the user asks for:
+- User mentions **Android** → skip to [Cross-compilation: Android](#cross-compilation)
+- User mentions **iOS** or **frameworks** → skip to [Cross-compilation: iOS](#cross-compilation)
+- User mentions a **model name** (llama, whisper, etc.) → skip to [LLM / ASR model runner](#llm--asr-model-runner-simplest-path-for-running-models)
+- User mentions **C++ runtime** or **cmake** → skip to [C++ runtime](#c-runtime-standalone)
+- Otherwise → default to **Python package** below
+
+### Python package (default)
 ```bash
-make help           # list all targets
-make llama-cpu      # Llama
-make whisper-metal  # Whisper on Metal
-make gemma3-cuda    # Gemma3 on CUDA
+conda activate executorch
+./install_executorch.sh --editable  # editable install from source
 ```
+This handles everything: submodules, deps, C++ build, Python install. Takes ~10 min on Apple Silicon.
+
+For subsequent rebuilds (deps already present): `pip install -e . --no-build-isolation`
+
+For minimal install (skip example deps): `./install_executorch.sh --minimal`
+
+Enable additional backends:
+```bash
+CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON" ./install_executorch.sh --editable
+```
+
+Verify: `python -c "from executorch.exir import to_edge_transform_and_lower; print('OK')"`
+
+### LLM / ASR model runner (simplest path for running models)
+
+```bash
+conda activate executorch
+make <model>-<backend>
+```
+
+Available targets (run `make help` for full list):
+
+| Target | Backend | macOS | Linux |
+|--------|---------|-------|-------|
+| `llama-cpu` | CPU | yes | yes |
+| `llama-cuda` | CUDA | | yes |
+| `llama-cuda-debug` | CUDA (debug) | | yes |
+| `llava-cpu` | CPU | yes | yes |
+| `whisper-cpu` | CPU | yes | yes |
+| `whisper-metal` | Metal | yes | |
+| `whisper-cuda` | CUDA | | yes |
+| `parakeet-cpu` | CPU | yes | yes |
+| `parakeet-metal` | Metal | yes | |
+| `parakeet-cuda` | CUDA | | yes |
+| `voxtral-cpu` | CPU | yes | yes |
+| `voxtral-cuda` | CUDA | | yes |
+| `voxtral-metal` | Metal | yes | |
+| `voxtral_realtime-cpu` | CPU | yes | yes |
+| `voxtral_realtime-cuda` | CUDA | | yes |
+| `voxtral_realtime-metal` | Metal | yes | |
+| `gemma3-cpu` | CPU | yes | yes |
+| `gemma3-cuda` | CUDA | | yes |
+| `sortformer-cpu` | CPU | yes | yes |
+| `sortformer-cuda` | CUDA | | yes |
+| `silero-vad-cpu` | CPU | yes | yes |
+| `clean` | | yes | yes |
 
 Output: `cmake-out/examples/models/<model>/<runner>`
 
-## C++ Libraries (CMake)
+### C++ runtime (standalone)
+
+**With presets (recommended):**
+
+| Platform | Command |
+|----------|---------|
+| macOS | `cmake -B cmake-out --preset macos` (uses Xcode generator — requires Xcode) |
+| Linux | `cmake -B cmake-out --preset linux -DCMAKE_BUILD_TYPE=Release` |
+| Windows | `cmake -B cmake-out --preset windows -T ClangCL` |
+
+Then: `cmake --build cmake-out --config Release -j$(sysctl -n hw.ncpu)` (macOS) or `cmake --build cmake-out -j$(nproc)` (Linux)
+
+**LLM libraries via workflow presets** (configure + build + install in one command):
+```bash
+cmake --workflow --preset llm-release        # CPU
+cmake --workflow --preset llm-release-metal  # Metal (macOS)
+cmake --workflow --preset llm-release-cuda   # CUDA (Linux/Windows)
+```
+
+**Manual CMake (custom flags):**
+```bash
+cmake -B cmake-out \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DEXECUTORCH_BUILD_XNNPACK=ON \
+  -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+  -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON
+cmake --build cmake-out --parallel "$(nproc 2>/dev/null || sysctl -n hw.ncpu)"
+```
+
+Run `cmake --list-presets` to see all available presets.
+
+### Cross-compilation
+
+**iOS/macOS frameworks:**
+```bash
+./scripts/build_apple_frameworks.sh --coreml --mps --xnnpack
+```
+Link in Xcode with the `-all_load` linker flag.
+
+**Android:**
+
+Requires `ANDROID_NDK` to be set (typically by Android Studio or a standalone NDK install).
 ```bash
-cmake --list-presets                         # list presets
-cmake --workflow --preset llm-release        # LLM CPU
-cmake --workflow --preset llm-release-metal  # LLM Metal
+# Verify NDK is available
+echo $ANDROID_NDK  # must point to NDK root, e.g. ~/Library/Android/sdk/ndk/<version>
+export ANDROID_ABIS=arm64-v8a BUILD_AAR_DIR=aar-out
+mkdir -p $BUILD_AAR_DIR && sh scripts/build_android_library.sh
 ```
+
+## Key build options
+
+Most commonly needed flags (full list: `CMakeLists.txt`):
+
+| Flag | What it enables |
+|------|-----------------|
+| `EXECUTORCH_BUILD_XNNPACK` | XNNPACK CPU backend |
+| `EXECUTORCH_BUILD_COREML` | Core ML (macOS/iOS) |
+| `EXECUTORCH_BUILD_MPS` | MPS GPU (macOS/iOS) |
+| `EXECUTORCH_BUILD_METAL` | Metal compute (macOS, requires EXTENSION_TENSOR) |
+| `EXECUTORCH_BUILD_CUDA` | CUDA GPU (Linux/Windows, requires EXTENSION_TENSOR) |
+| `EXECUTORCH_BUILD_KERNELS_OPTIMIZED` | Optimized kernels |
+| `EXECUTORCH_BUILD_KERNELS_QUANTIZED` | Quantized kernels |
+| `EXECUTORCH_BUILD_EXTENSION_MODULE` | Module extension (requires DATA_LOADER + FLAT_TENSOR + NAMED_DATA_MAP) |
+| `EXECUTORCH_BUILD_EXTENSION_LLM` | LLM extension |
+| `EXECUTORCH_BUILD_TESTS` | Unit tests (`ctest --test-dir cmake-out --output-on-failure`) |
+| `EXECUTORCH_BUILD_DEVTOOLS` | DevTools (Inspector, ETDump) |
+| `EXECUTORCH_OPTIMIZE_SIZE` | Size-optimized build (`-Os`, no exceptions/RTTI) |
+| `CMAKE_BUILD_TYPE` | `Release` or `Debug` (5–10x slower). Some presets (e.g. `llm-release`) set this; others require it explicitly. |
+
+## Troubleshooting
+
+| Symptom | Fix |
+|---------|-----|
+| Missing headers / `CMakeLists.txt not found` in third-party | `git submodule sync --recursive && git submodule update --init --recursive` |
+| Mysterious failures after `git pull` or branch switch | `rm -rf cmake-out/ pip-out/ && git submodule sync && git submodule update --init --recursive` |
+| `conda env list` PermissionError | Use `CONDA_NO_PLUGINS=true conda env list` or check env dir directly |
+| CMake >= 4.0 | Works in practice despite `< 4.0` in docs; only fix if build actually fails |
+| `externally-managed-environment` / PEP 668 error | You're using system Python, not conda. Activate conda env first. |
+| pip conflicts with torch versions | Fresh conda env; or `./install_executorch.sh --use-pt-pinned-commit` |
+| Missing `Python.h` (Linux) | `sudo apt install python3.X-dev` |
+| Missing operator registrations at runtime | Link kernel libs with `-Wl,-force_load,<lib>` (macOS) or `-Wl,--whole-archive <lib> -Wl,--no-whole-archive` (Linux) |
+| `install_executorch.sh` fails on Intel Mac | No prebuilt PyTorch wheels; use `--use-pt-pinned-commit --minimal` |
+| XNNPACK build errors about cpuinfo/pthreadpool | Ensure `EXECUTORCH_BUILD_CPUINFO=ON` and `EXECUTORCH_BUILD_PTHREADPOOL=ON` (both ON by default) |
+| Duplicate kernel registration abort | Only link one `gen_operators_lib` per target |
+
+## Build output
+
+**From `./install_executorch.sh` (Python package):**
+
+| Artifact | Location |
+|----------|----------|
+| Python package | `site-packages/executorch` |
+
+**From CMake builds** (`cmake --install` with `CMAKE_INSTALL_PREFIX=cmake-out`):
+
+| Artifact | Location |
+|----------|----------|
+| Core runtime | `cmake-out/lib/libexecutorch.a` |
+| XNNPACK backend | `cmake-out/lib/libxnnpack_backend.a` |
+| executor_runner | `cmake-out/executor_runner` (Ninja/Make) or `cmake-out/Release/executor_runner` (Xcode) |
+| Model runners | `cmake-out/examples/models/<model>/<runner>` |
+
+**From cross-compilation:**
+
+| Artifact | Location |
+|----------|----------|
+| iOS frameworks | `cmake-out/*.xcframework` |
+| Android AAR | `aar-out/` |
+
+## Tips
+- Always use `Release` for benchmarking; `Debug` is 5–10x slower
+- `ccache` is auto-detected if installed (`brew install ccache`)
+- `Ninja` is faster than Make (`-G Ninja`) — but `--preset macos` uses Xcode generator
+- For LLM workflows, `make <model>-<backend>` is the simplest path
+- After `git pull`, clean and re-init submodules before rebuilding
````
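The skill doc above asks the reader to verify `python --version` and `cmake --version` against supported ranges (Python 3.10–3.13, CMake >= 3.24). A minimal sketch of that version gate in Python; the bounds come from the doc, while the helper names (`parse_version`, `python_ok`, `cmake_ok`) are mine, not part of ExecuTorch:

```python
import re

# Supported ranges taken from the skill doc: Python 3.10-3.13, CMake >= 3.24.
PY_MIN, PY_MAX = (3, 10), (3, 13)
CMAKE_MIN = (3, 24)

def parse_version(text):
    """Extract the leading major.minor pair from a version string."""
    m = re.search(r"(\d+)\.(\d+)", text)
    if not m:
        raise ValueError(f"no version found in {text!r}")
    return (int(m.group(1)), int(m.group(2)))

def python_ok(version_output):
    """True if `python --version` output falls in the supported range."""
    return PY_MIN <= parse_version(version_output) <= PY_MAX

def cmake_ok(version_output):
    """True if `cmake --version` output meets the documented minimum."""
    return parse_version(version_output) >= CMAKE_MIN

print(python_ok("Python 3.12.4"))        # True: in range
print(cmake_ok("cmake version 3.22.1"))  # False: below 3.24
```

Tuple comparison keeps the check simple: `(3, 9)` and `(3, 14)` both fall outside `PY_MIN..PY_MAX`, matching the doc's "recreate the env" rule.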
Lines changed: 35 additions & 0 deletions

```yaml
# New GitHub Actions workflow (all 35 lines added)
name: Build Cadence

on:
  pull_request:
  push:
    branches:
      - main
      - release/*
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

jobs:
  cpu-x86:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    with:
      job-name: build
      runner: linux.2xlarge
      docker-image: ci-image:executorch-ubuntu-22.04-clang12
      submodules: recursive
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
      script: |
        set -eux
        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        ./install_requirements.sh > /dev/null
        bash backends/cadence/build_cadence_runner.sh
```
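The workflow script above selects a conda environment from `conda env list --json` with the jq filter `.envs | .[-1]`, i.e. the last path in the `envs` array. The same selection expressed with Python's stdlib `json`, as a sketch; the sample JSON mirrors conda's `--json` output shape, and the env paths are made up for illustration:

```python
import json

# Shape of `conda env list --json`: {"envs": ["<base>", "<env1>", ...]}
sample = json.dumps({
    "envs": [
        "/opt/conda",                # base env is listed first
        "/opt/conda/envs/ci-build",  # hypothetical image-provided env
    ]
})

# Equivalent of `jq -r ".envs | .[-1]"`: take the last listed env path.
last_env = json.loads(sample)["envs"][-1]
print(last_env)  # /opt/conda/envs/ci-build
```

The script relies on the image-provided env sorting last; a more robust variant would match the env by name rather than position.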

`.github/workflows/pull.yml`

Lines changed: 4 additions & 2 deletions

```diff
@@ -1057,7 +1057,8 @@ jobs:
 
   test-samsung-quantmodels-linux:
     name: test-samsung-quantmodels-linux
-    # if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
+    # Skip this job if the pull request is from a fork (secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -1094,7 +1095,8 @@ jobs:
 
   test-samsung-models-linux:
     name: test-samsung-models-linux
-    # if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
+    # Skip this job if the pull request is from a fork (secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
```

`backends/arm/MODELS.md`

Lines changed: 1 addition & 0 deletions

```diff
@@ -12,6 +12,7 @@
 - Some popular torch.nn.functional models (NN functional)
 - Some popular torch.nn.modules models (NN modules)
 - Some popular torch ops (Torch Functions)
+- T5 (T5 for conditional generation)
 - Neural Super Sampler (NSS)
 - Phi-3
 - ResNet 18
```

`backends/arm/_passes/__init__.py`

Lines changed: 4 additions & 0 deletions

```diff
@@ -55,6 +55,9 @@
 from .decompose_index_select_to_gather_pass import (  # noqa
     DecomposeIndexSelectToGatherPass,
 )
+from .decompose_index_tensor_to_gather_pass import (  # noqa
+    DecomposeIndexTensorToGatherPass,
+)
 from .decompose_int16_activation_conv_pass import (  # noqa
     DecomposeConvWithInt16ActivationPass,
 )
@@ -136,6 +139,7 @@
 from .rewrite_le_lt_to_ge_gt_pass import RewriteLeLtToGeGtPass  # noqa
 from .rewrite_matmul import RewriteMatmulPass  # noqa
 from .rewrite_pad import RewritePadPass  # noqa
+from .rewrite_slice import RewriteSlicePass  # noqa
 from .rewrite_upsample import RewriteUpsamplePass  # noqa
 from .scalars_to_attribute_pass import ScalarsToAttributePass  # noqa
 from .size_adjust_input_pass import SizeAdjustInputPass  # noqa
```

`backends/arm/_passes/accumulate_index_put_pass.py`

Lines changed: 8 additions & 1 deletion

```diff
@@ -7,6 +7,10 @@
 import torch
 
 from executorch.backends.arm._passes import ArmPass
+from executorch.backends.arm._passes.decompose_index_tensor_to_gather_pass import (
+    DecomposeIndexTensorToGatherPass,
+)
+from executorch.backends.arm._passes.rewrite_index_put_pass import RewriteIndexPutPass
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
 
@@ -33,7 +37,10 @@ class AccumulateIndexPutPass(ArmPass):
     for the index_put op.
     """
 
-    _passes_required_after: Set[Type[ExportPass]] = set()
+    _passes_required_after: Set[Type[ExportPass]] = {
+        DecomposeIndexTensorToGatherPass,
+        RewriteIndexPutPass,
+    }
 
     def call_operator(self, op, args, kwargs, meta):
         if op not in (aten_ops + edge_ops) or not self.allowed_to_transform(meta):
```
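The change above populates `_passes_required_after`, declaring that `DecomposeIndexTensorToGatherPass` and `RewriteIndexPutPass` must run at some point after `AccumulateIndexPutPass`. A toy sketch of how a pass manager could enforce such declarations; the class and helper names are illustrative stand-ins (real ExecuTorch passes declare sets of class objects, and its actual validation code is not shown here):

```python
# Toy stand-ins for pass classes; only the ordering declaration matters.
class AccumulateIndexPut:
    passes_required_after = {"DecomposeIndexTensorToGather", "RewriteIndexPut"}

class DecomposeIndexTensorToGather:
    passes_required_after = set()

class RewriteIndexPut:
    passes_required_after = set()

def check_ordering(pipeline):
    """Raise if any pass's declared followers do not all appear later in the pipeline."""
    names = [p.__name__ for p in pipeline]
    for i, p in enumerate(pipeline):
        for required in getattr(p, "passes_required_after", set()):
            if required not in names[i + 1:]:
                raise ValueError(f"{p.__name__} requires {required} to run after it")

# Matches the order set up in arm_pass_manager.py: accumulate, then decompose, then rewrite.
check_ordering([AccumulateIndexPut, DecomposeIndexTensorToGather, RewriteIndexPut])
print("ordering OK")
```

Declaring constraints on the pass itself keeps the ordering knowledge next to the pass that depends on it, rather than buried in the pipeline definition.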

`backends/arm/_passes/arm_pass_manager.py`

Lines changed: 6 additions & 2 deletions

```diff
@@ -60,6 +60,7 @@
     DecomposeGroupedConvPass,
     DecomposeGroupNormPass,
     DecomposeIndexSelectToGatherPass,
+    DecomposeIndexTensorToGatherPass,
     DecomposeIntPowPass,
     DecomposeLayerNormPass,
     DecomposeLeakyReLUPass,
@@ -121,6 +122,7 @@
     RewriteLeLtToGeGtPass,
     RewriteMatmulPass,
     RewritePadPass,
+    RewriteSlicePass,
     RewriteUpsamplePass,
     ScalarsToAttributePass,
     SizeAdjustInputPass,
@@ -306,6 +308,9 @@ def _tosa_pipeline(
                 DecomposeEmbeddingPass(),
                 DecomposeIndexSelectToGatherPass(),
                 DecomposeStridedSliceCopyPass(),
+                DecomposeSliceScatterPass(),
+                AccumulateIndexPutPass(),
+                DecomposeIndexTensorToGatherPass(),
                 Conv1dUnsqueezePass(),
             ]
         )
@@ -328,8 +333,6 @@ def _tosa_pipeline(
         # Node transformation passes (post scalar-removal)
         self.add_passes(
             [
-                DecomposeSliceScatterPass(),
-                AccumulateIndexPutPass(),
                 RewriteIndexPutPass(),
                 RewriteBoolBitwiseToLogicalPass(),
                 DecomposeRemainderPass(),
@@ -374,6 +377,7 @@ def _tosa_pipeline(
                 RewriteConvPass(exported_program),
                 RewriteMatmulPass(),
                 RewritePadPass(),
+                RewriteSlicePass(),
             ]
         )
 
```
