
Commit 44d9570

Update on "[executorch][runtime] Introduce PteDataMap for weight sharing"
PteDataMap is the NamedDataMap that will live in the runtime. It is used to give delegates access to opaque named data stored in the PTE file. Open to alternative naming suggestions; maybe 'PTEDataMap' or 'ProgramDataMap'?

**Usage**

The PteDataMap is owned by the program and instantiated at program load time if named_data exists in the PTE file.

We introduce usage of 'std::optional' here. I think we can also use executorch::aten::optional to avoid adding the standard library?

When initializing delegates, the PteDataMap is given to delegate_init. Delegates can retrieve opaque delegate data by key using 'get_data'. This gives them a FreeableBuffer that they can free later.

**Testing**

This test uses the C++ flatbuffer API to build a fake program containing named data. We also create a temp file with sample data that the data loader can wrap around.

TODO: e2e test once delegate AOT is ready and we can generate a file with named data.

**Note**

As the PteDataMap wraps around flatbuffer constructs, the Program must outlive the PteDataMap.

PteDataMap does not implement:
- get_metadata; currently, all data stored is opaque. Later, we can implement get_metadata if a backend stores plain tensor data.
- load_into; this is mostly used for the training case and isn't used by delegates, at least not at the moment.

Differential Revision: [D70213646](https://our.internmc.facebook.com/intern/diff/D70213646/)

[ghstack-poisoned]
2 parents 6c4f055 + eb2e2fc commit 44d9570
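
To make the delegate-facing flow described above concrete, here is a minimal sketch of how a backend's init might consume the map. Only the shape of the API, get_data by key returning a FreeableBuffer handed through delegate_init, comes from the commit message; the header paths, the Result plumbing, the load_shared_weights/consume_weights helpers, and the "encoder_weights" key are assumptions for illustration, not the committed API surface.

```cpp
// A minimal sketch, assuming the header paths and Result/Error plumbing
// below; the key name and helper functions are hypothetical.
#include <cstddef>

#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/freeable_buffer.h>
#include <executorch/runtime/core/named_data_map.h>

using executorch::runtime::Error;
using executorch::runtime::FreeableBuffer;
using executorch::runtime::NamedDataMap;
using executorch::runtime::Result;

// Stand-in for whatever the backend actually does with the raw bytes.
void consume_weights(const void* data, size_t size);

// A backend's init receives the PteDataMap (as a NamedDataMap pointer)
// via delegate_init and can look up opaque blobs by key.
Error load_shared_weights(const NamedDataMap* named_data_map) {
  // "encoder_weights" is a hypothetical key chosen by the AOT flow.
  Result<FreeableBuffer> weights = named_data_map->get_data("encoder_weights");
  if (!weights.ok()) {
    return weights.error();
  }
  consume_weights(weights->data(), weights->size());
  // The delegate owns the buffer and frees it when done. Note that the
  // Program must still outlive the PteDataMap itself, as the map points
  // into flatbuffer constructs owned by the Program.
  weights->Free();
  return Error::Ok;
}
```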

File tree

63 files changed: +1028 additions, -1449 deletions


.ci/scripts/test_ane_static_llama.sh

Lines changed: 27 additions & 0 deletions

```diff
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+pushd $EXECUTORCH_ROOT/examples/apple/coreml/llama
+
+# Download stories llama110m artifacts
+download_stories_model_artifacts
+
+python export.py -n model.pte -p params.json -c stories110M.pt --seq_length 32 --max_seq_length 64 --dtype fp16 --coreml-quantize c4w
+
+popd
```

.ci/scripts/test_model.sh

Lines changed: 16 additions & 1 deletion

```diff
@@ -100,6 +100,14 @@ test_model() {
         rm "./${MODEL_NAME}.pte"
         return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears.
     fi
+    if [[ "${MODEL_NAME}" == "phi4_mini" ]]; then
+        # Install requirements for export_llama
+        bash examples/models/llama/install_requirements.sh
+        # Test export_llama script: python3 -m examples.models.llama.export_llama.
+        "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/phi-4-mini/config.json
+        run_portable_executor_runner
+        rm "./${MODEL_NAME}.pte"
+    fi
 
     # Export a basic .pte and run the model.
     "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export --model_name="${MODEL_NAME}" "${STRICT}"
@@ -164,6 +172,7 @@ test_model_with_qnn() {
     export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
     export PYTHONPATH=$EXECUTORCH_ROOT/..
 
+    EXTRA_FLAGS=""
     if [[ "${MODEL_NAME}" == "dl3" ]]; then
         EXPORT_SCRIPT=deeplab_v3
     elif [[ "${MODEL_NAME}" == "mv3" ]]; then
@@ -176,6 +185,12 @@ test_model_with_qnn() {
         EXPORT_SCRIPT=inception_v3
     elif [[ "${MODEL_NAME}" == "vit" ]]; then
         EXPORT_SCRIPT=torchvision_vit
+    elif [[ "${MODEL_NAME}" == "mb" ]]; then
+        EXPORT_SCRIPT=mobilebert_fine_tune
+        EXTRA_FLAGS="--num_epochs 1"
+        pip install scikit-learn
+    elif [[ "${MODEL_NAME}" == "w2l" ]]; then
+        EXPORT_SCRIPT=wav2letter
     elif [[ "${MODEL_NAME}" == "edsr" ]]; then
         EXPORT_SCRIPT=edsr
         # Additional deps for edsr
@@ -189,7 +204,7 @@ test_model_with_qnn() {
     # TODO(guangyang): Make QNN chipset matches the target device
     QNN_CHIPSET=SM8450
 
-    "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only
+    "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only $EXTRA_FLAGS
     EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }
```

.github/workflows/trunk.yml

Lines changed: 23 additions & 1 deletion

```diff
@@ -229,6 +229,28 @@ jobs:
         # see if we can import the module successfully
         ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
 
+  test-static-llama-ane:
+    name: test-static-llama-ane
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m1-stable
+      python-version: '3.11'
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+        bash .ci/scripts/setup-conda.sh
+        eval "$(conda shell.bash hook)"
+
+        # Install requirements
+        sh install_requirements.sh
+        sh backends/apple/coreml/scripts/install_requirements.sh
+        python install_executorch.py --pybind coreml
+        sh examples/models/llama/install_requirements.sh
+
+        # Test ANE llama
+        sh .ci/scripts/test_ane_static_llama.sh
+
   test-llama-runner-macos:
     name: test-llama-runner-mac
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
@@ -311,7 +333,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        model: [dl3, mv3, mv2, ic4, ic3, vit]
+        model: [dl3, mv3, mv2, ic4, ic3, vit, mb, w2l]
       fail-fast: false
     with:
       runner: linux.2xlarge
```

backends/arm/_passes/TARGETS

Lines changed: 1 addition & 0 deletions

```diff
@@ -9,5 +9,6 @@ python_library(
         "//executorch/backends/transforms:replace_scalar_with_tensor",
         "//executorch/backends/xnnpack/_passes:xnnpack_passes",
         "//executorch/exir:lib",
+        "//executorch/backends/transforms:utils",
     ],
 )
```

backends/arm/_passes/arm_pass_utils.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
```

backends/arm/_passes/fuse_batchnorm2d_pass.py

Lines changed: 82 additions & 48 deletions

```diff
@@ -6,10 +6,15 @@
 # pyre-unsafe
 
 import torch
+from executorch.backends.transforms.utils import (
+    create_constant_placeholder,
+    delete_constant_placeholder,
+)
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch._export.utils import get_buffer, get_param
+from torch.export.graph_signature import InputKind
 from torch.fx import Node
 from torch.nn.utils.fusion import fuse_conv_bn_weights
 
@@ -23,7 +28,7 @@ def __init__(self, exported_program: ExportedProgram):
         self.exported_program = exported_program
         super().__init__()
 
-    def is_fuseable_conv_bn(self, node: Node):
+    def is_fuseable_conv_bn(self, node: Node) -> bool:
         """Returns True if node is a batchnorm that can be fused into
         a parent convolution."""
         if node.op != "call_function":
@@ -44,15 +49,19 @@
         # Since we change the output of the conv, fuse only if it has single user.
         if len(conv.users) > 1:
             return False
-        # For similar reasons, only fuse if conv parameters have single user.
-        if len(conv.all_input_nodes[1].users) > 1:
-            return False
-        if len(conv.all_input_nodes) > 2 and len(conv.all_input_nodes[2].users) > 1:
-            return False
         return True
 
+    def get_bias_name(self, conv_weight_node: Node, conv_bias_node: Node) -> str:
+        if conv_bias_node:
+            return conv_bias_node.name + "_fused_bn"
+        elif "weight" in conv_weight_node.name:
+            return conv_weight_node.name.replace("weight", "bias") + "_fused_bn"
+        else:
+            return conv_weight_node.name + "_bias_fused_bn"
+
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:  # noqa: C901
         modified = False
+        constant_placeholders_to_delete = set()
         for node in graph_module.graph.nodes:
             if not self.is_fuseable_conv_bn(node):
                 continue
@@ -64,68 +73,93 @@ def get_param_or_none(arg) -> torch.nn.Parameter | None:
                 )
 
             # Get weight, bias, mean, var and epsilon from the batchnorm
-            bn = node
-            conv, bn_weight_node, bn_bias_node, bn_mean_node, bn_var_node = bn.args[0:5]
-            bn_weight = get_param_or_none(bn_weight_node)
-            bn_bias = get_param_or_none(bn_bias_node)
-
-            running_mean = get_buffer(self.exported_program, bn_mean_node)
-            running_var = get_buffer(self.exported_program, bn_var_node)
-            if running_mean is None or running_var is None:
+            bn_node = node
+            conv, bn_weight_node, bn_bias_node, bn_mean_node, bn_var_node = (
+                bn_node.args[0:5]
+            )
+            bn_weight_tensor = get_param_or_none(bn_weight_node)
+            bn_bias_tensor = get_param_or_none(bn_bias_node)
+            bn_mean_tensor = get_buffer(self.exported_program, bn_mean_node)
+            bn_var_tensor = get_buffer(self.exported_program, bn_var_node)
+            if bn_mean_tensor is None or bn_var_tensor is None:
                 raise ValueError(
                     "Parameters running_mean and running_var of batchnorm can't be None."
                 )
-            epsilon = bn.args[-1]
+            epsilon = bn_node.args[-1]
 
             # Get weight and bias from conv
             conv_weight_node, conv_bias_node = conv.args[1:3]
-            conv_weight = get_param(self.exported_program, conv_weight_node)
-            conv_bias = get_param_or_none(conv_bias_node)
-            if conv_weight is None:
+            conv_weight_tensor = get_param(self.exported_program, conv_weight_node)
+            conv_bias_tensor = get_param_or_none(conv_bias_node)
+            if conv_weight_tensor is None:
                 raise ValueError("Parameter weight of convolution can't be None.")
 
             # Compute conv parameters folded with batchnorm
             fused_conv_weight, fused_conv_bias = fuse_conv_bn_weights(
-                conv_weight,
-                conv_bias,
-                running_mean,
-                running_var,
+                conv_weight_tensor,
+                conv_bias_tensor,
+                bn_mean_tensor,
+                bn_var_tensor,
                 epsilon,
-                bn_weight,
-                bn_bias,
+                bn_weight_tensor,
+                bn_bias_tensor,
             )
 
-            # Set the conv parameters to fused value
-            def try_set_param(
-                param_node: Node | None, param_value: torch.nn.Parameter
-            ) -> bool:
-                """set_param but check if param_node is None first. Return True if param was set successfully, otherwise False."""
-                if param_node is not None:
-                    param_name = (
-                        self.exported_program.graph_signature.inputs_to_parameters[
-                            param_node.name
-                        ]
+            # Create fused weights and bias to conv and replace conv args
+            with graph_module.graph.inserting_before(conv_weight_node):
+                fused_conv_weight_node = create_constant_placeholder(
+                    exp_program=self.exported_program,
+                    graph=graph_module.graph,
+                    kind=InputKind.PARAMETER,
+                    name=conv_weight_node.name + "_fused_bn",
+                    data=fused_conv_weight,
+                )
+
+                if fused_conv_bias is not None:
+                    fused_conv_bias_node = create_constant_placeholder(
+                        exp_program=self.exported_program,
+                        graph=graph_module.graph,
+                        kind=InputKind.PARAMETER,
+                        name=self.get_bias_name(conv_weight_node, conv_bias_node),
+                        data=fused_conv_bias,
                     )
-                    self.exported_program.state_dict[param_name] = param_value
-                    return True
-                return False
+                else:
+                    fused_conv_bias_node = None
+
+            conv.args = (
+                conv.args[0],
+                fused_conv_weight_node,
+                fused_conv_bias_node,
+                *conv.args[3:],
+            )
 
-            try_set_param(conv_weight_node, fused_conv_weight)
-            if not try_set_param(conv_bias_node, fused_conv_bias) and try_set_param(
-                bn_bias_node, fused_conv_bias
-            ):
-                # pyre-ignore[60]
-                # Conv didn't have bias but batchnorm did, steal bias from batchnorm.
-                conv_args = (*conv.args[0:2], bn_bias_node, *conv.args[3:])
-                conv.args = conv_args
-
-            # Erasing nodes is handled by dead-code elimination.
-            for user in bn.users:
+            # Erasing batch-norm nodes is handled by dead-code elimination. After that we may remove their constant placeholder inputs
+            for user in bn_node.users:
                 user.replace_all_uses_with(conv)
+
+            constant_placeholders_to_delete.update(
+                [
+                    bn_weight_node,
+                    bn_bias_node,
+                    bn_mean_node,
+                    bn_var_node,
+                    conv_weight_node,
+                    conv_bias_node,
+                ]
+            )
             modified = True
 
         if modified:
             graph_module.graph.eliminate_dead_code()
+            for constant_placeholder in constant_placeholders_to_delete:
+                if (constant_placeholder is not None) and (
+                    len(constant_placeholder.users) == 0
+                ):
+                    delete_constant_placeholder(
+                        self.exported_program, constant_placeholder
+                    )
+
             graph_module.recompile()
             graph_module = super().call(graph_module).graph_module
+
         return PassResult(graph_module=graph_module, modified=modified)
```
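
For reference, the folding this pass delegates to `fuse_conv_bn_weights` is the standard batch-norm absorption identity. Writing $\mu$, $\sigma^2$ for the running mean and variance, $\gamma$, $\beta$ for the batch-norm affine parameters, and $W_c$, $b_c$ for the conv weight and bias of output channel $c$ (with $b_c = 0$ when the conv has no bias):

$$
W_c' = W_c \cdot \frac{\gamma_c}{\sqrt{\sigma_c^2 + \epsilon}},
\qquad
b_c' = (b_c - \mu_c) \cdot \frac{\gamma_c}{\sqrt{\sigma_c^2 + \epsilon}} + \beta_c
$$

After this rewrite the batch-norm node computes an identity, so dead-code elimination removes it, and the pass then deletes any constant placeholders left without users.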

backends/arm/test/models/test_w2l_arm.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -131,7 +131,6 @@ def test_w2l_u55_BI(self):
 
     @pytest.mark.slow
     @pytest.mark.corstone_fvp
-    @unittest.skip("Blocked by MLBEDSW-10420")
     @conftest.expectedFailureOnFVP  # TODO: MLBEDSW-10093
     def test_w2l_u85_BI(self):
         tester = self._test_w2l_ethos_BI_pipeline(
```

backends/arm/test/passes/test_fuse_batchnorm_pass.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -85,13 +85,13 @@ def forward(self, x):
         return x
 
 
-class MergeNoBN(torch.nn.Module):
+class MergeMultipleUsersBN(torch.nn.Module):
     ops_before_pass = {
         "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 2,
         "executorch_exir_dialects_edge__ops_aten_convolution_default": 3,
     }
     ops_after_pass = {
-        "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 2,
+        "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default": 1,
         "executorch_exir_dialects_edge__ops_aten_convolution_default": 3,
     }
 
@@ -122,7 +122,7 @@ def forward(self, x):
         z = self.conv2d2(x)
         a = self.batch_norm2d(
             y
-        )  # Can't be fused since paramters of conv2d2 have multiple users.
+        )  # Can be fused despite paramters of conv2d2 having multiple users.
 
         return z, a
 
@@ -131,7 +131,7 @@ def forward(self, x):
     "merge_one_of_two_bn_affine": MergeOneOfTwoBN(True),
     "merge_one_of_two_bn": MergeOneOfTwoBN(False),
     "merge_two_of_two_bn_affine": MergeTwosOfTwoBN(True),
-    "merge_no_bn_affine": MergeNoBN(True),
+    "merge_multiple_users_bn_affine": MergeMultipleUsersBN(True),
 }
```

backends/cadence/aot/pass_utils.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -104,6 +104,16 @@ def count_node(graph_module: torch.fx.GraphModule, target: torch.fx.node.Target)
     return total
 
 
+def op_counts_match(
+    graph_module: torch.fx.GraphModule,
+    expected_op_counts: dict[EdgeOpOverload, int],
+) -> bool:
+    for op, count in expected_op_counts.items():
+        if count_node(graph_module, op) != count:
+            return False
+    return True
+
+
 # Testing utils
 # Return the compute/function nodes in the graph
 def get_compute_nodes_in_gm(graph_module: torch.fx.GraphModule) -> List[torch.fx.Node]:
```
